# Text Analysis

Notebook by: Juan Shishido

In this notebook, I'll start cleaning the text columns and, more importantly, thinking about how to classify and group the data within them. Consider using n-grams for word occurence.

## Imports

In [1]:
import re
import random
import lda
import csv
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

## Load

In [2]:
df = pd.read_csv('../../data/cleaned/UCB_dept_merge.csv')

### Random Sample: 20,000

In [3]:
random.seed(8675309)
rows = random.sample(df.index, 20000)
df = df.ix[rows].reset_index()

In [4]:
df.head()

Unnamed: 0,index,po_id,po_num,creation_date,supplier_name,item_type,product_description,manufacturer,quantity,unit_price,department,buyer__first_name,buyer__last_name,po_closed_date,department_name,spend
0,245817,35963640,BB00319075,2014-02-11 00:00:00,FISHER SCIENTIFIC,SQ Hosted Product,Molecular BioProducts Electroporation Cuvettes...,Molecular BioProducts,2,109.07,,Jon,Kuchenreuther,2014-07-16 00:00:00,OOEBI Depolymerization,767.03
1,311817,27067475,BB00134494,2013-02-04 00:00:00,OFFICE MAX,SQ Hosted Product,"Tablemate - Balloons, Color Assorted",TABLE MATE PRODUCTS INC,1,16.81,,William,Wolf,2013-04-22 00:00:00,OOEBI Depolymerization,18.32
2,405620,30901583,BB00218407,2013-07-16 00:00:00,GRAINGER INC,PunchOut Product,"CS2460120 Split Lock Washer, Spring Steel, Zin...",GRAINGER APPROVED VENDOR,1,2.46,,MIKE,COURTER,2013-10-02 00:00:00,FJPPS BM Plumbing Shop,1890.47
3,525992,24759795,BB00085067,2012-10-11 00:00:00,Thorlabs Inc,NonCatalog Product,"SM05 Threaded Kinematic Cage Mount O1/2"" Optics.",Thorlabs,1,82.0,,Michael,Hohensee,2013-03-01 00:00:00,PHYSI SS Atomic & Molecular,4783.54
4,171686,35510783,BB00309841,2014-01-24 00:00:00,FISHER SCIENTIFIC,NonCatalog Product,10 mL serological pipettes 200/case,,3,29.2,,Philip,Nguyen,2014-10-30 00:00:00,IMMCB BH Research,745.2


## Transform

### Nan, Lowercase, Alphanumeric, Special Characters, Whitespace

In [5]:
cols = ['supplier_name', 'item_type', 'product_description', 'manufacturer', 
        'buyer__first_name', 'buyer__last_name', 'department_name']

In [6]:
for col in cols:
    df[col] = df[col].replace(np.nan, '' , regex=True)                                      \
                     .apply(lambda x: x.lower())                                            \
                     .apply(lambda x: re.sub('(http\S*|www\S*)', '', x))                    \
                     .apply(lambda x: re.sub('((?<=\D)/|/(?=\D))', ' ', x))                 \
                     .apply(lambda x: re.sub('[^A-Za-z0-9.%\/]+', ' ', x))                  \
                     .apply(lambda x: re.sub('\.+', '', x))                                 \
                     .apply(lambda x: re.sub('(?<=\s)\w(?=\s)|(?<=\s)\d(?=\s)', '', x))     \
                     .apply(lambda x: re.sub('\s+', ' ', x).strip())

In [7]:
df.head()

Unnamed: 0,index,po_id,po_num,creation_date,supplier_name,item_type,product_description,manufacturer,quantity,unit_price,department,buyer__first_name,buyer__last_name,po_closed_date,department_name,spend
0,245817,35963640,BB00319075,2014-02-11 00:00:00,fisher scientific,sq hosted product,molecular bioproducts electroporation cuvettes...,molecular bioproducts,2,109.07,,jon,kuchenreuther,2014-07-16 00:00:00,ooebi depolymerization,767.03
1,311817,27067475,BB00134494,2013-02-04 00:00:00,office max,sq hosted product,tablemate balloons color assorted,table mate products inc,1,16.81,,william,wolf,2013-04-22 00:00:00,ooebi depolymerization,18.32
2,405620,30901583,BB00218407,2013-07-16 00:00:00,grainger inc,punchout product,cs2460120 split lock washer spring steel zinc ...,grainger approved vendor,1,2.46,,mike,courter,2013-10-02 00:00:00,fjpps bm plumbing shop,1890.47
3,525992,24759795,BB00085067,2012-10-11 00:00:00,thorlabs inc,noncatalog product,sm05 threaded kinematic cage mount o1/2 optics,thorlabs,1,82.0,,michael,hohensee,2013-03-01 00:00:00,physi ss atomic molecular,4783.54
4,171686,35510783,BB00309841,2014-01-24 00:00:00,fisher scientific,noncatalog product,10 ml serological pipettes 200 case,,3,29.2,,philip,nguyen,2014-10-30 00:00:00,immcb bh research,745.2


## Exploratory

### Product Description

#### Unique Entries

In [8]:
unique_entries = df.groupby('product_description')['product_description'].count()

#### Unique Words

In [9]:
words = [w.strip().split(' ') for w in df.product_description.dropna().values]
word_list = [i for word in words for i in word]
word_counts = Counter(word_list)
top_100_words = word_counts.most_common(100)
for word in top_100_words:
    print word

('in', 2810)
('for', 2480)
('size', 2219)
('color', 2121)
('pack', 1991)
('and', 1655)
('black', 1594)
('1/2', 1545)
('to', 1447)
('of', 1393)
('white', 1359)
('type', 1272)
('with', 1203)
('10', 1135)
('100', 1080)
('quantity', 1067)
('12', 1017)
('unit', 990)
('length', 903)
('paper', 843)
('11', 795)
('box', 793)
('20', 788)
('vwr', 748)
('blue', 738)
('cs', 691)
('ea', 677)
('pk', 668)
('hp', 667)
('1/4', 637)
('25', 606)
('service', 602)
('steel', 595)
('50', 591)
('order', 591)
('500', 569)
('officemax', 566)
('material', 559)
('3/4', 543)
('per', 531)
('assorted', 528)
('cartridge', 518)
('request', 512)
('lb', 509)
('tube', 501)
('toner', 488)
('no', 487)
('use', 484)
('capacity', 483)
('high', 474)
('sterile', 472)
('medium', 461)
('clear', 460)
('free', 460)
('24', 451)
('cap', 443)
('point', 440)
('description', 438)
('the', 433)
('tip', 431)
('style', 423)
('non', 418)
('000', 409)
('ink', 405)
('yellow', 404)
('head', 403)
('tips', 390)
('dia', 388)
('sheets', 384)
('stand

## Bag of Words

### Words to Features

In [10]:
pd_list = []
for i in xrange(0, df.product_description.size):
    pd_list.append(df.product_description[i])

In [11]:
vectorizer = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None, 
                             stop_words = 'english')

In [12]:
word_features = vectorizer.fit_transform(pd_list).toarray()

In [13]:
word_features.shape

(20000, 25691)

In [14]:
word_features[0:5,:]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
vocab = vectorizer.get_feature_names()
print vocab[:15]

[u'00', u'000', u'0000', u'00000', u'00000367', u'00000369', u'00000current', u'0000124', u'00003', u'00006y5y40073', u'00006y5y40083', u'00006y5y40093', u'00007052period', u'00007958', u'00008']


In [16]:
vocab_map = vectorizer.vocabulary_
vocab_map

{u'hcho': 14219,
 u'awgkcmil': 7700,
 u'e2313h': 11976,
 u'autosamplers': 7668,
 u'leuconostoc': 16038,
 u'431033promo': 4329,
 u'woods': 25362,
 u'hanging': 14163,
 u'maxbeam': 16723,
 u'phenylpropylamino': 19187,
 u'shure': 21949,
 u'endoglin': 12326,
 u'7446period': 5803,
 u'inventorial': 15154,
 u'59mm': 5129,
 u'hermann': 14332,
 u'meadows': 16797,
 u'cytochrome': 10876,
 u'cyclodextrin': 10848,
 u'propane': 20002,
 u'bookeye': 8429,
 u'120125': 1195,
 u'16id7': 1852,
 u'lumin': 16340,
 u'hitchcock': 14432,
 u'cholestech': 9748,
 u'celite': 9449,
 u'wurster': 25435,
 u'fm300': 13201,
 u'c1823d': 8827,
 u'interlocking': 15123,
 u'oaklanddrop': 18190,
 u'cynomolgus': 10869,
 u'methylaniline': 16963,
 u'basics': 7956,
 u'27374': 3097,
 u'wooden': 25360,
 u'wednesday': 25196,
 u'209538000': 2408,
 u'naturallyspeaking': 17745,
 u'f25t8': 12736,
 u'330mm': 3644,
 u'99999': 6522,
 u'99998': 6521,
 u'0059': 144,
 u'specially': 22396,
 u'gaskets': 13545,
 u'targt': 23353,
 u'0051': 139,
 u

### Output

Trouble outputting file.

In [17]:
"""np.savetxt('../../data/cleaned/doc_term_matrix.csv', word_features, delimiter=',')"""

"np.savetxt('../../data/cleaned/doc_term_matrix.csv', word_features, delimiter=',')"

In [18]:
"""dtm = pd.HDFStore("../../data/cleaned/doc_term_matrix.hdf")
dtm.append("a", pd.DataFrame(word_features))
dtm.close()"""

'dtm = pd.HDFStore("../../data/cleaned/doc_term_matrix.hdf")\ndtm.append("a", pd.DataFrame(word_features))\ndtm.close()'

## LDA

In [19]:
X = word_features

In [20]:
model = lda.LDA(n_topics=100, n_iter=1500, random_state=8675309)
model.fit(X)



<lda.lda.LDA instance at 0x107a55488>

In [21]:
topic_word = model.topic_word_
n_top_words = 11

with open('../../results/topic_definitions.csv', 'wb') as to_:
    writer = csv.writer(to_, delimiter=',', quotechar='\"')
    doc_topic = model.doc_topic_
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        writer.writerow([i, ' '.join(topic_words)])

In [22]:
with open('../../results/pd_topics.csv', 'wb') as to_:
    writer = csv.writer(to_, delimiter=',', quotechar='\"')
    doc_topic = model.doc_topic_
    for i in range(len(pd_list)):
        writer.writerow([pd_list[i], doc_topic[i].argmax()])

## gensim

In [None]:
from gensim import models

In [None]:
"""lda = models.ldamodel.LdaModel(X, num_topics=100)"""