# Text Analysis

Notebook by: Juan Shishido

In this notebook, I'll start cleaning the text columns and, more importantly, thinking about how to classify and group the data within them. Consider using n-grams for word occurence.

## Imports

In [1]:
import re
import pandas as pd
import numpy as np
from collections import Counter

## Load

In [2]:
df = pd.read_csv('../../data/cleaned/UCB_dept_merge.csv')

In [3]:
df.head()

Unnamed: 0,po_id,po_num,creation_date,supplier_name,item_type,product_description,manufacturer,quantity,unit_price,department,buyer__first_name,buyer__last_name,po_closed_date,department_name,spend
0,29847876,BB00195887,2013-05-31 00:00:00,GIVE SOMETHING BACK,SQ Hosted Product,"PORTABLE COMBINATION LAPTOP LOCK, 6 FT. CARBON...",KENSINGTON,30,24.32,,Dustin,Miller,2013-07-16 00:00:00,UIAPA UB Academic Year,795.26
1,29847864,BB00195886,2013-05-31 00:00:00,GRAINGER INC,PunchOut Product,"Wall Mount Fan, Oscillating, Number of Speeds ...",AIR KING,1,35.58,,Erin,Pinkston,2013-08-08 00:00:00,UKHDS Unit 1 Apt Admin,38.78
2,29847796,BB00195884,2013-05-31 00:00:00,BELLCO GLASS INC,NonCatalog Product,"Septum Stopper, 20mm Blue Butyl Rubber QtyPerC...",,1,77.41,,William,Wolf,,CCHEM RES Research,104.67
3,29847820,BB00195885,2013-05-31 00:00:00,CHEMGLASS LIFE SCIENCES LLC,NonCatalog Product,"Column, Chromatography, 24/40 Outer Joint, 100...",,1,108.0,,William,Wolf,2014-10-30 00:00:00,CCHEM RES Research,71.77
4,29847565,BB00195881,2013-05-31 00:00:00,FISHER SCIENTIFIC,SQ Hosted Product,"Bottles, Media/Lab; Wheaton;Graduated; With ru...",Wheaton Science Products Inc,1,135.38,,Donald C.,RIO,,IMMCB BH Research,317.31


## Transform

### Nan, Lowercase, Alphanumeric, Special Characters, Whitespace

In [4]:
cols = ['supplier_name', 'item_type', 'product_description', 'manufacturer', 'buyer__first_name', 'buyer__last_name', 'department_name']

In [5]:
for col in cols:
    df[col] = df[col].replace(np.nan, '' , regex=True)\
                .apply(lambda x: x.lower())\
                .apply(lambda x: re.sub('[^A-Za-z0-9.%]+', ' ', x))\
                .apply(lambda x: re.sub('^\.+', '', x))\
                .apply(lambda x: re.sub('^\/', '', x))\
                .apply(lambda x: re.sub('\s+', ' ', x).strip())

Use regular expression to keep only certain forward slashed (/), e.g., those that relate to fractions.

In [6]:
df.head()

Unnamed: 0,po_id,po_num,creation_date,supplier_name,item_type,product_description,manufacturer,quantity,unit_price,department,buyer__first_name,buyer__last_name,po_closed_date,department_name,spend
0,29847876,BB00195887,2013-05-31 00:00:00,give something back,sq hosted product,portable combination laptop lock 6 ft. carbon ...,kensington,30,24.32,,dustin,miller,2013-07-16 00:00:00,uiapa ub academic year,795.26
1,29847864,BB00195886,2013-05-31 00:00:00,grainger inc,punchout product,wall mount fan oscillating number of speeds 3 ...,air king,1,35.58,,erin,pinkston,2013-08-08 00:00:00,ukhds unit 1 apt admin,38.78
2,29847796,BB00195884,2013-05-31 00:00:00,bellco glass inc,noncatalog product,septum stopper 20mm blue butyl rubber qtyperca...,,1,77.41,,william,wolf,,cchem res research,104.67
3,29847820,BB00195885,2013-05-31 00:00:00,chemglass life sciences llc,noncatalog product,column chromatography 24 40 outer joint 100ml ...,,1,108.0,,william,wolf,2014-10-30 00:00:00,cchem res research,71.77
4,29847565,BB00195881,2013-05-31 00:00:00,fisher scientific,sq hosted product,bottles media lab wheaton graduated with rubbe...,wheaton science products inc,1,135.38,,donald c.,rio,,immcb bh research,317.31


## Exploratory

### Product Description

#### Unique Entries

In [7]:
unique_entries = df.groupby('product_description')['product_description'].count()

#### Unique Words

In [8]:
words = [w.strip().split(' ') for w in df.product_description.dropna().values]

In [9]:
word_list = [i for word in words for i in word]

In [10]:
word_counts = Counter(word_list)

In [11]:
top_100_words = word_counts.most_common(100)

In [12]:
for word in top_100_words:
    print word

('1', 179547)
('x', 115073)
('2', 106208)
('3', 78989)
('for', 77270)
('4', 69120)
('8', 66664)
('size', 65926)
('color', 64001)
('pack', 60019)
('and', 52647)
('black', 48431)
('to', 44800)
('5', 44732)
('in', 44415)
('of', 42642)
('in.', 41438)
('white', 40761)
('with', 37434)
('type', 37277)
('10', 35480)
('12', 34637)
('100', 31382)
('quantity', 31152)
('6', 29672)
('unit', 28454)
('11', 28070)
('length', 26176)
('paper', 25784)
('20', 24506)
('vwr', 23414)
('box', 23111)
('blue', 21241)
('ea', 20980)
('hp', 20972)
('cs', 19753)
('w', 19725)
('16', 19477)
('service', 18632)
('steel', 18151)
('order', 18092)
('50', 18019)
('7', 17557)
('pk', 17188)
('500', 17100)
('material', 17059)
('a', 16675)
('25', 16654)
('9', 16436)
('per', 16279)
('officemax', 16265)
('cartridge', 16099)
('request', 15855)
('toner', 15831)
('assorted', 15471)
('high', 14959)
('capacity', 14803)
('24', 14630)
('tube', 14493)
('sterile', 14328)
('clear', 14300)
('l', 14187)
('use', 13896)
('description', 13679)

## Bag of Words

### Words to Features

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
pd_list = []

for i in xrange(0, df.product_description.size):
    pd_list.append(df.product_description[i])

In [15]:
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None)

In [16]:
word_features = vectorizer.fit_transform(pd_list).toarray()

In [17]:
word_features.shape

(611110, 165508)

In [18]:
word_features[0:5,:]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
vocab = vectorizer.get_feature_names()
print vocab[:15]

[u'00', u'000', u'0000', u'00000', u'000000', u'0000000302460', u'0000000546290', u'0000001', u'0000001108', u'0000001111806', u'0000001491308', u'0000001893108', u'00000020594506786cjk9d87', u'000000367', u'000000368']


In [20]:
vocab_map = vectorizer.vocabulary_
vocab_map

{u'tiebars': 149849,
 u'woodj': 163167,
 u'7aadbl': 46403,
 u'patchpad': 123802,
 u'woods': 163177,
 u'clotted': 72627,
 u'spiders': 143336,
 u'st500lt025': 144118,
 u'woody': 163185,
 u'universityplease': 158667,
 u'13n970': 10699,
 u'5988': 39930,
 u'canes': 68277,
 u'5982': 39923,
 u'5981': 39922,
 u'5986': 39926,
 u'5987': 39928,
 u'ch3ch2ch2c6h4oh': 70581,
 u'absrbt': 53354,
 u'heliothis': 96508,
 u'mshas4510': 115890,
 u'8mmh': 49158,
 u'ethylenedinitrilo': 86519,
 u'pigment': 126237,
 u'n2111': 116800,
 u'bringing': 64631,
 u'methylaniline': 112255,
 u'wooden': 163163,
 u'creles': 75419,
 u'wednesday': 162219,
 u'1080904': 6706,
 u'0000158978': 48,
 u'amplifications': 56447,
 u'gaskets': 92343,
 u'targt': 147826,
 u'atagcaacaagcttttagaactggtaaacgat': 58449,
 u'270': 23756,
 u'271': 23932,
 u'272': 24038,
 u'273': 24104,
 u'274': 24144,
 u'275': 24197,
 u'276': 24388,
 u'277': 24440,
 u'278': 24472,
 u'279': 24492,
 u'sebvb3130410': 139252,
 u'bottlepackage': 64112,
 u'g0026': 91

### Output

Trouble outputting file.

In [None]:
"""np.savetxt('../../data/cleaned/doc_term_matrix.csv', word_features, delimiter=',')"""

In [None]:
dtm = pd.HDFStore("doc_term_matrix.hdf")
dtm.append("a", pd.DataFrame(word_features))
dtm.close()

## LDA

In [None]:
import lda

In [None]:
X = word_features

In [None]:
"""model = lda.LDA(n_topics=50, n_iter=1000, random_state=1)
model.fit(X)"""

In [None]:
"""topic_word = model.topic_word_
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))"""

## gensim

In [None]:
from gensim import models

In [None]:
"""lda = models.ldamodel.LdaModel(X, num_topics=100)"""