In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import numpy as np
import pandas as pd
import json
import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from itertools import islice

In [3]:
# Prefix prepended to data file names; the empty string means the current working directory.
datapath = ''

In [4]:
# Load the frame stored under the 'table' key of the HDF5 file.
df = pd.read_hdf(datapath + 'df.h5', key='table')

In [5]:
# Text column that the vectorizer below is fitted on.
desc = df['comb_desc']

In [6]:
# Learn the single-word vocabulary from the descriptions and report how long the fit takes.
start_time = time.time()
word_counts = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),   # 1-grams only
    stop_words='english',
    min_df=.0025,         # max_df (previously tried as .1) is intentionally left unset
)
word_counts.fit(desc)
elapsed = time.time() - start_time
print("--- %.2f seconds ---" % elapsed)

--- 29.49 seconds ---


In [7]:
# Check how many total 1-grams we have across all product descs.
# Only the final expression of a cell is echoed, so the size has to be printed explicitly.
print('vocabulary size:', len(word_counts.vocabulary_))

# Show the 20 entries that come last when the (term, index) pairs are sorted, i.e. the
# end of the alphabet.
list(islice(sorted(word_counts.vocabulary_.items(), reverse=True), 20))

[('zippered', 1072),
 ('zipper', 1071),
 ('zip', 1070),
 ('youthful', 1069),
 ('york', 1068),
 ('yoke', 1067),
 ('yellow', 1066),
 ('year', 1065),
 ('wrap', 1064),
 ('worn', 1063),
 ('world', 1062),
 ('workout', 1061),
 ('work', 1060),
 ('woolblend', 1059),
 ('wool', 1058),
 ('womens', 1057),
 ('women', 1056),
 ('woman', 1055),
 ('wish', 1054),
 ('winter', 1053)]

In [8]:
# Turn the descriptions into a document-by-term count matrix, using the same series
# the vectorizer was fitted on.
wordcounts = word_counts.transform(desc)
print('sparse matrix shape:', wordcounts.shape)
print('nonzero count:', wordcounts.nnz)
# nnz / (rows * cols) is the share of non-zero cells, i.e. the density of the matrix
# (sparsity would be 100% minus this value).
n_rows, n_cols = wordcounts.shape
print('density: %.2f%%' % (100.0 * wordcounts.nnz / (n_rows * n_cols)))

sparse matrix shape: (1072150, 1073)
nonzero count: 20649077
sparsity: 1.79%


In [9]:
# top 15 most common terms
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when
# it exists and fall back to the old method on older installations.
if hasattr(word_counts, 'get_feature_names_out'):
    terms = word_counts.get_feature_names_out()
else:
    terms = word_counts.get_feature_names()
# Column sums of the count matrix give the total occurrences of each term.
occ = np.asarray(wordcounts.sum(axis=0)).ravel().tolist()
counts_desc = pd.DataFrame({'term': terms, 'occurrences': occ})
counts_desc.sort_values(by='occurrences', ascending=False).head(15).T

Unnamed: 0,261,843,69,189,691,829,338,113,151,660,1070,539,812,914,78
term,dress,sleeve,black,cotton,pocket,size,fit,button,closure,pant,zip,long,short,style,blue
occurrences,730867,425240,420710,403715,336093,334761,314197,274528,267543,262793,259659,257597,239232,238716,232741


In [10]:
# Convert the raw counts to tf-idf weights and report how long it takes.
start_time = time.time()
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(wordcounts)
elapsed = time.time() - start_time
print("--- %.2f seconds ---" % elapsed)

--- 0.61 seconds ---


In [11]:
# top 15 terms by average tf-idf weight:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when
# it exists and fall back to the old method on older installations.
if hasattr(word_counts, 'get_feature_names_out'):
    terms = word_counts.get_feature_names_out()
else:
    terms = word_counts.get_feature_names()
# Column means of the tf-idf matrix: one average weight per term across all rows.
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': terms, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(15).T

Unnamed: 0,261,69,189,843,660,691,812,928,78,836,1049,808,113,829,151
term,dress,black,cotton,sleeve,pant,pocket,short,sweater,blue,skirt,white,shirt,button,size,closure
weight,0.0715857,0.0440156,0.0357701,0.035391,0.0326948,0.0303458,0.0299723,0.0292727,0.0290085,0.0279998,0.0279041,0.0275573,0.0273254,0.0267282,0.0267073


In [12]:
# Persist the per-term average tf-idf weights next to the input data.
# `key` is passed by keyword because pandas 2.x deprecates positional arguments to
# to_hdf other than the path. The path uses `datapath`, like the read of 'df.h5'.
weights_df.to_hdf(datapath + 'tfidf_weights_df.h5', key='table')