In [1]:
import pandas as pd
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import re

In [2]:
# Load the semicolon-separated source tables: train features, train labels
# and test features (read in that order).
train_data, train_labels, test_data = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        "../data/external/train_data.csv",
        "../data/external/train_labels.csv",
        "../data/external/test_data.csv",
    )
)

In [3]:
# Replace every missing value in both feature tables with the string 'None'.
train_data, test_data = (
    frame.fillna('None') for frame in (train_data, test_data)
)

In [4]:
def _pick_code(frame):
    """Return `additional_code` where it is set, otherwise `okpd2_code`.

    A value counts as "not set" when it equals the string 'None', which is the
    placeholder written over missing values by the earlier `fillna('None')`.

    Implemented with a vectorized `Series.where` instead of a row-wise
    `apply(axis=1)`: it avoids a Python-level loop over every row and the
    integer-key lookups (`x[0]`, `x[1]`) on a label-indexed row, which pandas
    2.x deprecates.
    """
    additional = frame['additional_code']
    # Keep `additional` where the mask is True, take `okpd2_code` elsewhere.
    return additional.where(additional != 'None', frame['okpd2_code'])


train_data['okpd2_or_additional_code'] = _pick_code(train_data)
test_data['okpd2_or_additional_code'] = _pick_code(test_data)

In [5]:
digits = re.compile("[0-9]+")
def digits_only(text):
    """Concatenate all ASCII digits found in `text` and return them as an int.

    Every run of characters 0-9 is extracted in order and the runs are joined
    before conversion, so "ab12cd034" becomes 12034.

    Raises:
        ValueError: if `text` contains no digits at all (for example the
            'None' placeholder used for missing values), with the offending
            value included in the message.
    """
    digit_runs = digits.findall(text)
    if not digit_runs:
        # int('') would also raise ValueError, but without saying which value
        # was at fault; name it so the bad row can be located.
        raise ValueError(f"no digits found in {text!r}; cannot build a numeric id")
    # NOTE(review): joining runs maps e.g. "1-23" and "12-3" to the same number;
    # confirm the source ids cannot collide this way.
    return int(''.join(digit_runs))

In [6]:
%%time
# Convert pn_lot_anon to a number so it can serve as a unique identifier.
for frame in (train_data, test_data):
    frame['pn_lot_id'] = frame['pn_lot_anon'].apply(digits_only)

Wall time: 975 ms


In [7]:
# Stack the train rows on top of the test rows; ignore_index=True discards the
# original row labels and renumbers the combined frame from 0.
data = pd.concat((train_data, test_data), ignore_index=True)

In [9]:
from sklearn.preprocessing import OneHotEncoder
# One encoder instance, constructed with default settings; each later cell
# calls `fit_transform` on it again for a different column.
ohe = OneHotEncoder()

In [17]:
# One-hot encode the categorical `fz` column and put the indicator columns next
# to pn_lot_id; the table is saved to CSV afterwards for later use.
# `fit_transform` is the first argument, so it runs before `ohe.categories_`
# is read for the column names.
ohe_fz = data[['pn_lot_id']].join(
    pd.DataFrame(
        ohe.fit_transform(data[['fz']]).toarray(),
        columns=ohe.categories_[0],
    )
)
ohe_fz

Unnamed: 0,pn_lot_id,223fz,44fz
0,7031618,0.0,1.0
1,7808247,0.0,1.0
2,7009496,0.0,1.0
3,5938735,0.0,1.0
4,9327348,0.0,1.0
...,...,...,...
862994,5373821,1.0,0.0
862995,8396902,1.0,0.0
862996,7569089,1.0,0.0
862997,2403905,1.0,0.0


In [10]:
# Save the one-hot encoded `fz` table as a UTF-8 CSV; index=False leaves the
# row index out of the file.
ohe_fz.to_csv('../data/intermid/ohe_fz.csv', index=False, encoding='utf-8')

In [18]:
# One-hot encode the categorical `region_code` column and put the indicator
# columns (named region_<code>) next to pn_lot_id; the table is saved to CSV
# afterwards for later use.
# `fit_transform` is the first argument, so it runs before `ohe.categories_`
# is read for the column names.
ohe_region = data[['pn_lot_id']].join(
    pd.DataFrame(
        ohe.fit_transform(data[['region_code']]).toarray(),
        columns=['region_' + code for code in ohe.categories_[0].astype(str)],
    )
)
ohe_region

Unnamed: 0,pn_lot_id,region_1,region_2,region_3,region_4,region_5,region_6,region_7,region_8,region_9,...,region_83,region_84,region_85,region_86,region_87,region_88,region_89,region_91,region_92,region_99
0,7031618,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7808247,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7009496,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5938735,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9327348,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862994,5373821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
862995,8396902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
862996,7569089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
862997,2403905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Save the one-hot encoded region table as a UTF-8 CSV; index=False leaves the
# row index out of the file.
ohe_region.to_csv('../data/intermid/ohe_region.csv', index=False, encoding='utf-8')

In [20]:
# Cast the 'okpd2_or_additional_code' column to str.
data['okpd2_or_additional_code'] = data['okpd2_or_additional_code'].astype(str)

In [21]:
# One-hot encode the combined OKPD2 / additional code column and put the
# indicator columns (named code_<value>) next to pn_lot_id; the table is saved
# to CSV afterwards for later use.
# `fit_transform` is the first argument, so it runs before `ohe.categories_`
# is read for the column names.
ohe_okpd2 = data[['pn_lot_id']].join(
    pd.DataFrame(
        ohe.fit_transform(data[['okpd2_or_additional_code']]).toarray(),
        columns='code_' + ohe.categories_[0],
    )
)
ohe_okpd2

Unnamed: 0,pn_lot_id,code_01,code_01.1,code_01.2,code_01.3,code_01.4,code_01.5,code_01.6,code_01.7,code_02,...,code_95.2,code_96,code_96.0,code_97.0,code_98,code_98.1,code_99.0,code_F,code_None,code_drug
0,7031618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7808247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7009496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5938735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9327348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862994,5373821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
862995,8396902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
862996,7569089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
862997,2403905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Save the one-hot encoded code table as a UTF-8 CSV; index=False leaves the
# row index out of the file.
ohe_okpd2.to_csv('../data/intermid/ohe_okpd2.csv', index=False, encoding='utf-8')