In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the train and test sets that hold the normalised (stemmed) tender descriptions.
train_data_desc_stemmed = pd.read_csv(
    '../data/intermid/train_data_desc_stemmed.csv'
)
test_data_desc_stemmed = pd.read_csv(
    '../data/intermid/test_data_desc_stemmed.csv'
)

# Stack train on top of test into one frame; ignore_index=True renumbers the
# rows 0..n-1 so the combined frame has a single contiguous index.
data = pd.concat(
    (train_data_desc_stemmed, test_data_desc_stemmed),
    ignore_index=True,
)

In [3]:
# Sanity check on the combined frame: print its (rows, columns) shape,
# then show the first rows as the cell's rich output.
print(data.shape)
data.head()

(862999, 2)


Unnamed: 0,pn_lot_id,text_description_tender_stemmed
0,7031618,услуга проведение финансовый аудит
1,7808247,приобретение приз проведение конкурс вопрос из...
2,7009496,приобретение приз проведение мероприятие избир...
3,5938735,продление лицензия битрикс управление сайт веб...
4,9327348,приобретение компьютерный техника машина вычис...


In [4]:
%%time
# Build the TF-IDF document-term matrix from the stemmed descriptions,
# using the vectorizer's default settings.
vectors = TfidfVectorizer().fit_transform(
    data['text_description_tender_stemmed']
)
# Reduce the TF-IDF matrix to 100 components with truncated SVD;
# random_state is fixed so repeated runs give the same projection.
X_reduced = TruncatedSVD(
    n_components=100,
    random_state=0,
).fit_transform(vectors)

Wall time: 1min 35s


In [5]:
%%time
# Build a second TF-IDF matrix whose features are word bigrams only
# (ngram_range=(2, 2)) instead of the default settings.
vectors_1 = TfidfVectorizer(ngram_range=(2, 2)).fit_transform(
    data['text_description_tender_stemmed']
)
# Reduce the bigram TF-IDF matrix to 100 components with truncated SVD;
# random_state is fixed so repeated runs give the same projection.
X_reduced_1 = TruncatedSVD(
    n_components=100,
    random_state=0,
).fit_transform(vectors_1)

Wall time: 8min 8s


In [6]:
# Pair every lot identifier with its reduced vector. The join is on the row
# index: `data` was concatenated with ignore_index=True and the frames built
# from the SVD arrays get a default 0..n-1 index, so rows line up by position.
# The resulting frames are written to CSV in a later cell.
lot_ids = data[['pn_lot_id']]
svd_vectors_redused = lot_ids.join(pd.DataFrame(X_reduced))
svd_vectors_redused_1 = lot_ids.join(pd.DataFrame(X_reduced_1))
del lot_ids  # keep the notebook namespace identical to the original cell

In [7]:
# Show the frame of lot ids plus the 100 SVD components built from the
# default-settings TF-IDF matrix (pandas renders a truncated view).
svd_vectors_redused

Unnamed: 0,pn_lot_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,7031618,0.073380,-0.056731,-0.045633,-0.084574,-0.069164,0.066576,-0.026270,-0.069280,-0.036473,...,-0.011532,0.009917,0.004976,-0.005292,-0.034566,0.018954,0.067127,0.062390,-0.066390,0.033915
1,7808247,0.047604,-0.020799,-0.005164,-0.000875,-0.031943,-0.010390,0.035026,0.028589,0.031976,...,-0.038640,-0.019499,0.001789,-0.045525,0.013005,0.003948,0.023157,0.011054,0.003779,0.014668
2,7009496,0.038434,-0.020887,0.002598,0.005830,-0.030764,-0.031133,0.035859,0.012698,0.037547,...,-0.005455,-0.015890,-0.023686,-0.017980,-0.014966,0.027483,0.027166,0.017373,0.027362,0.037736
3,5938735,0.070268,-0.034888,-0.019043,-0.060084,-0.094230,0.019838,-0.017156,-0.017957,0.090035,...,0.015021,-0.027357,0.020155,0.021619,0.004156,-0.023597,0.001860,-0.016468,-0.001139,-0.010066
4,9327348,0.128968,-0.064764,0.013677,0.063222,-0.121039,-0.161423,0.229098,0.144899,0.172583,...,-0.020119,0.009598,0.017670,-0.006916,-0.023590,0.063985,0.004921,0.021528,-0.029531,0.054680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862994,5373821,0.122818,-0.077080,-0.031454,-0.050902,0.120885,-0.111679,-0.016274,0.000277,0.139890,...,-0.019912,-0.023850,0.012015,-0.017172,-0.016846,0.015756,0.005560,0.006304,0.060005,0.020579
862995,8396902,0.123394,-0.077275,-0.031489,-0.050778,0.121877,-0.111553,-0.016455,-0.000026,0.136316,...,-0.022289,-0.023968,0.019633,-0.014056,-0.013453,0.008801,0.006855,0.010563,0.056733,0.016638
862996,7569089,0.076051,-0.021969,0.007363,0.005460,-0.036076,-0.056971,-0.014552,-0.015000,0.077969,...,0.007075,0.019998,-0.023743,-0.003322,0.013769,-0.014706,0.020641,0.008812,0.022848,0.004639
862997,2403905,0.134301,-0.089409,-0.052109,-0.115832,0.058897,-0.082456,-0.022398,-0.020234,0.169255,...,-0.037382,0.014163,0.041581,0.012076,-0.072971,-0.028236,0.006528,-0.034705,0.001368,0.028676


In [8]:
# Show the frame of lot ids plus the 100 SVD components built from the
# bigram TF-IDF matrix (pandas renders a truncated view).
svd_vectors_redused_1

Unnamed: 0,pn_lot_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,7031618,0.000103,0.000410,0.024214,0.000877,0.055451,-0.020267,-0.001789,-0.024950,-0.007987,...,0.062841,0.088314,-0.062334,-0.030441,0.032369,-0.096118,-0.142143,-0.053776,-0.179469,0.159360
1,7808247,0.000007,0.000008,0.000262,0.000045,0.000274,-0.000088,0.000023,-0.000175,0.000021,...,0.000108,0.001277,0.000869,0.000165,-0.000289,0.000524,0.000503,-0.000411,0.000366,0.000745
2,7009496,0.000012,0.000027,0.001353,0.000080,0.000291,-0.000161,0.000023,-0.000276,0.000912,...,0.000491,0.001206,-0.000807,-0.001985,-0.000046,0.000699,-0.001610,0.001607,0.001050,-0.001327
3,5938735,0.000111,0.000219,0.003915,0.000890,0.007365,-0.002334,0.003777,-0.003227,-0.001383,...,-0.043494,0.080305,-0.010154,-0.001914,-0.012898,0.034536,-0.019996,0.032964,-0.037014,0.020862
4,9327348,0.000118,0.000420,0.014132,0.005363,0.011435,-0.004156,0.003899,-0.013033,0.030464,...,-0.017188,-0.055838,-0.021248,0.028205,-0.003040,0.018078,0.007829,0.057615,-0.025547,-0.040214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862994,5373821,0.000426,0.003874,0.029996,0.000422,0.011275,0.013117,0.001935,-0.008270,0.058954,...,0.009217,-0.023388,-0.008746,0.065121,0.005347,-0.034169,-0.004398,-0.012610,0.002711,0.008568
862995,8396902,0.000421,0.003835,0.029602,0.000428,0.011103,0.012858,0.001911,-0.008190,0.058256,...,0.009040,-0.022887,-0.008717,0.064325,0.005167,-0.033814,-0.004295,-0.012948,0.002529,0.008575
862996,7569089,0.000355,0.002827,0.007325,0.000493,0.005010,-0.000385,0.002434,-0.003001,0.000553,...,0.002339,-0.003452,-0.001002,-0.004069,0.000396,-0.001200,-0.002168,-0.000463,-0.000120,0.001626
862997,2403905,0.000438,0.003756,0.016281,0.000567,0.015067,0.008030,0.002404,-0.007275,0.019650,...,0.022096,-0.000215,-0.002145,0.081529,0.024055,-0.000341,-0.038683,-0.008848,0.036105,-0.009803


In [9]:
# Persist both feature frames as UTF-8 CSV files for later steps; index=False
# omits the row index, so each file holds pn_lot_id plus the component columns.
svd_vectors_redused.to_csv(
    '../data/intermid/svd_vectors_redused.csv',
    index=False,
    encoding='utf-8',
)
svd_vectors_redused_1.to_csv(
    '../data/intermid/svd_vectors_ngrams_redused.csv',
    index=False,
    encoding='utf-8',
)

In [30]:
# svd_vectors_redused