In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle

from math import ceil
from pylab import rcParams
from scipy import stats
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from inspect import getfullargspec

%matplotlib inline
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = 16, 8

pd.options.display.max_columns = 100

In [36]:
df = pd.read_csv('train.csv', sep=';', on_bad_lines='skip')

df.sample(5)

Unnamed: 0,DateTime,Person_Id,Contract_Id,Account_Id,AccountPart_Id,Instrument_Id,Quantity,Number,ExternalAccount,Comments,ДатаДокумента,ВидДокумента,ВидОперации,НомерВходящегоДокумента,ДатаВходящегоДокумента,СуммаДокумента,ВалютаДокумента,СчетБанк,СчетОрганизации,СчетОрганизацииНомерСчета,СчетОрганизацииБанк,Контрагент,КонтрагентИНН,СчетКонтрагента,СчетКонтрагентаНомерСчета,СчетКонтрагентаБанк,СчетУчетаРасчетовСКонтрагентом,СубконтоДт1,СубконтоДт2,СубконтоДт3,СубконтоДт4,СубконтоДт5,СтатьяДвиженияДенежныхСредств,ЦеннаяБумага,НоменклатурнаяГруппа,НазначениеПлатежа,ДоговорКонтрагентаРасшифровкаПлатежа,СтатьяДвиженияДенежныхСредствРасшифровкаПлатежа,СуммаПлатежаРасшифровкаПлатежа,СтавкаНДСРасшифровкаПлатежа,СуммаНДСРасшифровкаПлатежа,СчетУчетаРасчетовСКонтрагентомРасшифровкаПлатежа,СчетУчетаРасчетовПоАвансамРасшифровкаПлатежа,СчетНаОплатуРасшифровкаПлатежа,НоменклатурнаяГруппаРасшифровкаПлатежа,ЦеннаяБумагаРасшифровкаПлатежа,СтрокаНайдена
9849,06.03.2024 0:00:00,2624,11628,104164,92911,10,1743,007-1#1,,INTEREST ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Нет
3763,15.04.2024 0:00:00,2554,8857,20737,39385,5,1000000,01495,30414810000000000911,РК 00813. Возврат индивидуального клирингового...,15.04.2024 0:00:00,ПоступлениеНаРасчетныйСчет,Прочее поступление,1495.0,15.04.2024 0:00:00,1000000.0,руб.,55.04,ВК_НКО НРД 30411810000005000812 брок. торговый,30411810000005000812,044525505 НКО АО НРД,НКЦ НКО АО,7750004000.0,,,,76.09,НКЦ НКО АО,00813 от 21.03.2013,Акции,Денежные средства по акциям,,Прочие поступления,,Акции,РК 00813. Возврат индивидуального клирингового...,,Прочие поступления,1000000.0,,0.0,,,,Акции,,Да
14208,15.01.2024 0:00:00,2554,8498,60922,60930,5,400000,58,40701810501850000069,Перераспределение денежных средств для текущих...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Нет
14598,03.05.2024 0:00:00,13462,51623,61730,61405,5,46596,128,03100643000000018500,Единый налоговый платеж,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Нет
2864,23.04.2024 0:00:00,2554,8857,142,39363,5,15890,189,40701810000006000812,Оплата за депозитарные услуги с августа 2023 г...,23.04.2024 0:00:00,СписаниеСРасчетногоСчета,Перевод на другой счет организации,189.0,23.04.2024 0:00:00,15890.0,руб.,55.04,ВК_НКО НРД 810-0-6 Москвин Ю.В.,40701810000006000812,044525505 НКО АО НРД,,,ВК_НКО НРД 810-2-0 соб.,40701810200000000812,044525505 НКО АО НРД,55.04,ВК_НКО НРД 810-2-0 соб.,,,,,Комиссия депозитария,,,Оплата за депозитарные услуги с августа 2023 г...,,Комиссия депозитария,15890.0,,0.0,,,,,,Да


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15240 entries, 0 to 15239
Data columns (total 47 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   DateTime                                          15240 non-null  object 
 1   Person_Id                                         15240 non-null  int64  
 2   Contract_Id                                       15240 non-null  int64  
 3   Account_Id                                        15240 non-null  int64  
 4   AccountPart_Id                                    15240 non-null  int64  
 5   Instrument_Id                                     15240 non-null  int64  
 6   Quantity                                          15240 non-null  object 
 7   Number                                            15240 non-null  object 
 8   ExternalAccount                                   15107 non-null  object 
 9   Comments         

### Поля которые нужно заполнить.
* ВидДокумента
* ВидОперации
* Контрагент
* СчетУчетаРасчетовСКонтрагентом
* Субконто1…5
* СтатьяДвиженияДенежныхСредств
* ЦеннаяБумага
* НоменклатурнаяГруппа

In [38]:
data = df[~df["ВидДокумента"].isna()]

In [39]:
data.shape

(8273, 47)

In [40]:
def data_vectorizer(data, column, vectorizer=None):
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
        # векторизация текстовых данных с помощью TF-IDF     
        X = vectorizer.fit_transform(data[column])   
    else:
        X = vectorizer.transform(data[column])   
    
    return pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out()), vectorizer    

In [41]:
def train_model(data, features, target):           
    X = data[features]
    y = data[target]
    
    model = LogisticRegression() 
    model.fit(X, y)    
    
    return model

In [54]:
# выполним label-encoding
encoder = LabelEncoder().fit(data['ВидДокумента'])
data['ВидДокументаEnc'] = encoder.transform(data['ВидДокумента'])
data_vectorize, vectorizer = data_vectorizer(data=data, column='Comments')
data = pd.concat([data, data_vectorize], axis=1)
data = data.fillna(data['ExternalAccount'].mode()[0]) 
data['ExternalAccountEnc'] = LabelEncoder().fit_transform(data['ExternalAccount'])
# делим набор для начала активного обучения
train_data, valid_data = train_test_split(data, train_size=0.2, random_state=42)
columns = list(data_vectorize.columns)
columns.append('Person_Id')
columns.append('Contract_Id')
columns.append('Account_Id')
columns.append('AccountPart_Id')
#columns.append('ExternalAccountEnc')
# обучение модели
model = train_model(train_data, columns, 'ВидДокументаEnc')
# формируем итоговый набор с предсказанными классами
X = valid_data[columns]
y_predicted = model.predict(X) 
auc = roc_auc_score(valid_data['ВидДокументаEnc'], y_predicted)
print(auc)
# результат
df = pd.concat([pd.DataFrame(encoder.inverse_transform(y_predicted), columns=['ВидДокумента']), valid_data], axis=1)
df.sample(10)

0.9554952316637763


Unnamed: 0,ВидДокумента,DateTime,Person_Id,Contract_Id,Account_Id,AccountPart_Id,Instrument_Id,Quantity,Number,ExternalAccount,Comments,ДатаДокумента,ВидДокумента.1,ВидОперации,НомерВходящегоДокумента,ДатаВходящегоДокумента,СуммаДокумента,ВалютаДокумента,СчетБанк,СчетОрганизации,СчетОрганизацииНомерСчета,СчетОрганизацииБанк,Контрагент,КонтрагентИНН,СчетКонтрагента,СчетКонтрагентаНомерСчета,СчетКонтрагентаБанк,СчетУчетаРасчетовСКонтрагентом,СубконтоДт1,СубконтоДт2,СубконтоДт3,СубконтоДт4,СубконтоДт5,СтатьяДвиженияДенежныхСредств,ЦеннаяБумага,НоменклатурнаяГруппа,НазначениеПлатежа,ДоговорКонтрагентаРасшифровкаПлатежа,СтатьяДвиженияДенежныхСредствРасшифровкаПлатежа,СуммаПлатежаРасшифровкаПлатежа,СтавкаНДСРасшифровкаПлатежа,СуммаНДСРасшифровкаПлатежа,СчетУчетаРасчетовСКонтрагентомРасшифровкаПлатежа,СчетУчетаРасчетовПоАвансамРасшифровкаПлатежа,СчетНаОплатуРасшифровкаПлатежа,НоменклатурнаяГруппаРасшифровкаПлатежа,ЦеннаяБумагаРасшифровкаПлатежа,СтрокаНайдена,ВидДокументаEnc,00,...,цод,части,частичная,частично,частичное,частичном,часть,через,черкизово,черногорэнерго,числе,чтпз,шашина,шд,шкафа,штрафа,ый,эйч,экзамена,экземпляров,эко,экспертиза,эл,электрической,электромагистраль,электронном,электронным,электронных,электрорешения,эм,энергетическая,энергоника,энергосбыт,энерготехсервис,эр,эс,эсэфай,эталон,эф,ювелит,югры,юл,юридические,якутия,якутская,январе,январь,январь2024,января,ятэк
2883,СписаниеСРасчетногоСчета,29.01.2024 0:00:00,2554.0,8857.0,143.0,39383.0,5.0,11900.0,46.0,40701810200000000812,Оплата возмещения расходов по банковской комис...,29.01.2024 0:00:00,СписаниеСРасчетногоСчета,Перевод на другой счет организации,46.0,29.01.2024 0:00:00,11900.0,руб.,55.04,ВК_НКО НРД 40701810500001000812 брок.,40701810500001000812,044525505 НКО АО НРД,40701810500001000812,40701810500001000812,ВК_НКО НРД 810-2-0 соб.,40701810200000000812,044525505 НКО АО НРД,55.04,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,Перевод ДС со счета на счет,40701810500001000812,40701810500001000812,Оплата возмещения расходов по банковской комис...,40701810500001000812,Перевод ДС со счета на счет,11900.0,20%,198333.0,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,Да,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2718,ПоступлениеНаРасчетныйСчет,06.02.2024 0:00:00,2554.0,8504.0,62438.0,61455.0,5.0,7900.0,14488.0,40703810210000000497,Оплата по счету 110 от 02.02.2024г. Услуги по ...,06.02.2024 0:00:00,СписаниеСРасчетногоСчета,Оплата поставщику,14488.0,06.02.2024 0:00:00,7900.0,руб.,51.0,ВК_СБЕРБАНК ПАО 810-1745,40701810838000001745,044525225 ПАО СБЕРБАНК,НАУФОР,7712088223.0,"р/c 40703810210000000497 ФИЛИАЛ ""ЦЕНТРАЛЬНЫЙ"" ...",40703810210000000497,"044525411 ФИЛИАЛ ""ЦЕНТРАЛЬНЫЙ"" БАНКА ВТБ (ПАО)",40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,Оплата поставщикам Дт сч.60,40701810500001000812,Хозяйственные расходы,Оплата по счету 110 от 02.02.2024г. Услуги по ...,110 от 02.02.2024,Оплата поставщикам Дт сч.60,7900.0,Без НДС,0.0,60.01,60.02,40701810500001000812,Хозяйственные расходы,40701810500001000812,Да,1.0,0.095909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.385963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1249,СписаниеСРасчетногоСчета,05.02.2024 0:00:00,2554.0,8857.0,143.0,39383.0,5.0,1144419.0,5.0,40701810300010000812,Купонный доход по облигациям ПАО Группа компан...,05.02.2024 0:00:00,СписаниеСРасчетногоСчета,Перевод на другой счет организации,5.0,05.02.2024 0:00:00,1144419.0,руб.,55.04,ВК_НКО НРД 810-3-10 депозитарный,40701810300010000812,044525505 НКО АО НРД,40701810500001000812,40701810500001000812,ВК_НКО НРД 40701810500001000812 брок.,40701810500001000812,044525505 НКО АО НРД,55.04,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,Перевод ДС со счета на счет,40701810500001000812,40701810500001000812,Купонный доход по облигациям ПАО Группа компан...,40701810500001000812,Перевод ДС со счета на счет,1144419.0,20%,190737.0,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,Да,1.0,0.131291,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6484,СписаниеСРасчетногоСчета,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6644,,25.03.2024 0:00:00,2554.0,8504.0,207.0,38375.0,5.0,6500.0,14697.0,40817810138170911595,Отзыв денежных средств на основании Договора о...,25.03.2024 0:00:00,СписаниеСРасчетногоСчета,Прочее списание,14697.0,25.03.2024 0:00:00,6500.0,руб.,51.0,ВК_Тверское ОСБ 041 брокерский,40701810338040100041,044525225 ПАО СБЕРБАНК,Пресняков Сергей Андреевич,772704462418.0,40701810500001000812,40701810500001000812,40701810500001000812,76.09,Пресняков Сергей Андреевич,VC-BC-200821/02 от 21.08.2020,Акции,Денежные средства по акциям,40701810500001000812,Прочие расходы,40701810500001000812,Акции,Отзыв денежных средств на основании Договора о...,40701810500001000812,Прочие расходы,6500.0,20%,108333.0,40701810500001000812,40701810500001000812,40701810500001000812,Акции,40701810500001000812,Да,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1596,СписаниеСРасчетногоСчета,19.06.2024 0:00:00,2554.0,8857.0,20737.0,39385.0,5.0,500000.0,284.0,30414810000000000911,РК 00813. Возврат индивидуального клирингового...,19.06.2024 0:00:00,ПоступлениеНаРасчетныйСчет,Прочее поступление,284.0,19.06.2024 0:00:00,500000.0,руб.,55.04,ВК_НКО НРД 30411810000005000812 брок. торговый,30411810000005000812,044525505 НКО АО НРД,НКЦ НКО АО,7750004023.0,40701810500001000812,40701810500001000812,40701810500001000812,76.09,НКЦ НКО АО,00813 от 21.03.2013,Акции,Денежные средства по акциям,40701810500001000812,Прочие поступления,40701810500001000812,Акции,РК 00813. Возврат индивидуального клирингового...,40701810500001000812,Прочие поступления,500000.0,40701810500001000812,0.0,40701810500001000812,40701810500001000812,40701810500001000812,Акции,40701810500001000812,Да,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1076,СписаниеСРасчетногоСчета,30.05.2024 0:00:00,2554.0,8857.0,172.0,39401.0,5.0,39102.0,1.0,40701810500001000812,"Купонный доход по облигациям ПАО ""Группа ЛСР""....",30.05.2024 0:00:00,СписаниеСРасчетногоСчета,Перевод на другой счет организации,1.0,30.05.2024 0:00:00,39102.0,руб.,55.04,ВК_НКО НРД 810-3-10 депозитарный,40701810300010000812,044525505 НКО АО НРД,40701810500001000812,40701810500001000812,ВК_НКО НРД 40701810500001000812 брок.,40701810500001000812,044525505 НКО АО НРД,55.04,НРД НКО АО,VC-DC-151101/01 от 01.11.2015,Доходы по ценным бумагам,Купонный доход,40701810500001000812,Прочие расходы,40701810500001000812,Доходы по ценным бумагам,"Купонный доход по облигациям ПАО ""Группа ЛСР""....",40701810500001000812,Прочие расходы,39102.0,40701810500001000812,0.0,40701810500001000812,40701810500001000812,40701810500001000812,Доходы по ценным бумагам,40701810500001000812,Да,1.0,0.151318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2715,СписаниеСРасчетногоСчета,11.04.2024 0:00:00,2554.0,8504.0,62438.0,61455.0,5.0,86093.0,14801.0,40702810238290014549,Оплата счета 24-0460К от 31.03.2024г.Сервисное...,11.04.2024 0:00:00,СписаниеСРасчетногоСчета,Оплата поставщику,14801.0,11.04.2024 0:00:00,86093.0,руб.,51.0,ВК_СБЕРБАНК ПАО 810-1745,40701810838000001745,044525225 ПАО СБЕРБАНК,ДИМИ-ДОС ООО,7719891912.0,р/с 40702810238290014549 в ПАО СБЕРБАНК,40702810238290014549,044525225 ПАО СБЕРБАНК,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,Оплата поставщикам Дт сч.60,40701810500001000812,Хозяйственные расходы,Оплата счета 24-0460К от 31.03.2024г.Сервисное...,24-0460К от 31.03.2024,Оплата поставщикам Дт сч.60,86093.0,20%,14349.0,60.01,60.02,40701810500001000812,Хозяйственные расходы,40701810500001000812,Да,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
701,СписаниеСРасчетногоСчета,18.04.2024 0:00:00,2554.0,8502.0,183.0,38391.0,5.0,100000.0,908.0,40701810850140856902,Перераспределение денежных средств. НДС не об...,18.04.2024 0:00:00,СписаниеСРасчетногоСчета,Перевод на другой счет организации,908.0,18.04.2024 0:00:00,100000.0,руб.,51.0,ВК_Промсвязьбанк 810-8-902 брок.,40701810850140856902,"044525555 ПАО ""ПРОМСВЯЗЬБАНК""",40701810500001000812,40701810500001000812,ВК_Промсвязьбанк 810-5-901,40701810550140856901,"044525555 ПАО ""ПРОМСВЯЗЬБАНК""",51,ВК_Промсвязьбанк 810-5-901,Перевод ДС со счета на счет,40701810500001000812,40701810500001000812,40701810500001000812,Перевод ДС со счета на счет,40701810500001000812,40701810500001000812,Перераспределение денежных средств. НДС не об...,40701810500001000812,Перевод ДС со счета на счет,100000.0,40701810500001000812,0.0,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,40701810500001000812,Да,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5966,ПоступлениеНаРасчетныйСчет,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [56]:
with open("modle.pkl", "wb") as f:
    pickle.dump(model, f)

In [58]:
with open("modle.pkl", "rb") as f:
    model = pickle.load(f)

https://habr.com/ru/articles/548910/