# Text Analysis

Notebook by: Juan Shishido

In this notebook, I'll start cleaning the text columns and, more importantly, thinking about how to classify and group the data within them. Consider using n-grams for word occurrence.

## Imports

In [1]:
import re
import pandas as pd
import numpy as np
from collections import Counter

## Load

In [2]:
# Read the cleaned UCB_dept_merge.csv file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../../data/cleaned/UCB_dept_merge.csv')

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,po_id,po_num,creation_date,supplier_name,item_type,product_description,manufacturer,quantity,unit_price,department,buyer__first_name,buyer__last_name,po_closed_date,department_name,spend
0,29847876,BB00195887,2013-05-31 00:00:00,GIVE SOMETHING BACK,SQ Hosted Product,"PORTABLE COMBINATION LAPTOP LOCK, 6 FT. CARBON...",KENSINGTON,30,24.32,,Dustin,Miller,2013-07-16 00:00:00,UIAPA UB Academic Year,795.26
1,29847864,BB00195886,2013-05-31 00:00:00,GRAINGER INC,PunchOut Product,"Wall Mount Fan, Oscillating, Number of Speeds ...",AIR KING,1,35.58,,Erin,Pinkston,2013-08-08 00:00:00,UKHDS Unit 1 Apt Admin,38.78
2,29847796,BB00195884,2013-05-31 00:00:00,BELLCO GLASS INC,NonCatalog Product,"Septum Stopper, 20mm Blue Butyl Rubber QtyPerC...",,1,77.41,,William,Wolf,,CCHEM RES Research,104.67
3,29847820,BB00195885,2013-05-31 00:00:00,CHEMGLASS LIFE SCIENCES LLC,NonCatalog Product,"Column, Chromatography, 24/40 Outer Joint, 100...",,1,108.0,,William,Wolf,2014-10-30 00:00:00,CCHEM RES Research,71.77
4,29847565,BB00195881,2013-05-31 00:00:00,FISHER SCIENTIFIC,SQ Hosted Product,"Bottles, Media/Lab; Wheaton;Graduated; With ru...",Wheaton Science Products Inc,1,135.38,,Donald C.,RIO,,IMMCB BH Research,317.31


## Transform

### NaN, Lowercase, Alphanumeric, Special Characters, Whitespace

In [4]:
# Text columns that are normalised by the cleaning step.
cols = [
    'supplier_name',
    'item_type',
    'product_description',
    'manufacturer',
    'buyer__first_name',
    'buyer__last_name',
    'department_name',
]

In [5]:
# Patterns are compiled once and written as raw strings so the backslash
# escapes reach the regex engine untouched.
_DISALLOWED_CHARS = re.compile(r'[^A-Za-z0-9.%]+')
_LEADING_DOTS = re.compile(r'^\.+')
_WHITESPACE_RUN = re.compile(r'\s+')


def _clean_text(text):
    """Normalise a single text value.

    Lowercases ``text``, replaces each run of characters other than letters,
    digits, '.' and '%' with a single space, drops dots at the very start of
    the string, then collapses whitespace runs and trims both ends.

    NOTE(review): dots that follow leading whitespace are kept because the
    trim happens last -- confirm whether that is intended.
    """
    text = _DISALLOWED_CHARS.sub(' ', text.lower())
    text = _LEADING_DOTS.sub('', text)
    # No separate leading-'/' removal is needed: every '/' has already been
    # turned into a space by the disallowed-character substitution.
    return _WHITESPACE_RUN.sub(' ', text).strip()


for col in cols:
    # Missing values become empty strings so the string methods are safe.
    df[col] = df[col].fillna('').apply(_clean_text)

TODO: Use a regular expression to keep only certain forward slashes (/), e.g., those that are part of fractions.

In [6]:
# Preview the first five rows after the text clean-up.
df.head(n=5)

Unnamed: 0,po_id,po_num,creation_date,supplier_name,item_type,product_description,manufacturer,quantity,unit_price,department,buyer__first_name,buyer__last_name,po_closed_date,department_name,spend
0,29847876,BB00195887,2013-05-31 00:00:00,give something back,sq hosted product,portable combination laptop lock 6 ft. carbon ...,kensington,30,24.32,,dustin,miller,2013-07-16 00:00:00,uiapa ub academic year,795.26
1,29847864,BB00195886,2013-05-31 00:00:00,grainger inc,punchout product,wall mount fan oscillating number of speeds 3 ...,air king,1,35.58,,erin,pinkston,2013-08-08 00:00:00,ukhds unit 1 apt admin,38.78
2,29847796,BB00195884,2013-05-31 00:00:00,bellco glass inc,noncatalog product,septum stopper 20mm blue butyl rubber qtyperca...,,1,77.41,,william,wolf,,cchem res research,104.67
3,29847820,BB00195885,2013-05-31 00:00:00,chemglass life sciences llc,noncatalog product,column chromatography 24 40 outer joint 100ml ...,,1,108.0,,william,wolf,2014-10-30 00:00:00,cchem res research,71.77
4,29847565,BB00195881,2013-05-31 00:00:00,fisher scientific,sq hosted product,bottles media lab wheaton graduated with rubbe...,wheaton science products inc,1,135.38,,donald c.,rio,,immcb bh research,317.31


## Exploratory

### Product Description

#### Unique Entries

In [7]:
# Number of rows sharing each distinct product description.
description_groups = df.groupby('product_description')
description_groups['product_description'].count()

product_description
                                                  7
0 12mm adapter conn.inlet a f                     1
0 12mm chem vac chem cap valve                    1
0 12mm chem vac plug                              1
0 2 c18 0 phosphatidylet200mg 1 ea                1
0 30 psi pressure gauge                           1
0 300 psi gauge                                   2
0 3000 psi gage block assembly                    1
0 3000v programmable power supply                 1
0 30v 0 3a variable benchtop power supply         2
0 4500 psi outlet 3 8 tube fitting regulator.     1
0 4mm chem cap plug with knob                     2
0 4mm chem vac chem cap valve                     2
0 4mm chem vac metering plug with control knob    1
0 4mm chem vac plug                               1
...
zyppy plasmid miniprep kit 400 preps unit same as d4020. please refer to quote 250028446 for discounted pricing    1
zyppy plasmid miniprep kit 50 preps promo code zr holiday       1
zyppy plasmid

#### Unique Words

In [8]:
# Split every non-null description on single spaces: one token list per row.
words = []
for description in df.product_description.dropna().values:
    words.append(description.strip().split(' '))

In [9]:
# Flatten the per-row token lists into one long list of tokens.
word_list = []
for tokens in words:
    word_list.extend(tokens)

In [10]:
# Tally how many times each token occurs.
word_counts = Counter()
word_counts.update(word_list)

In [11]:
# The 100 most frequent tokens as (token, count) pairs.
top_100_words = word_counts.most_common(n=100)

In [12]:
# Print one (token, count) pair per line. The call form of print produces the
# same output as the Python 2 statement for a single tuple and also runs on
# Python 3.
for word in top_100_words:
    print(word)

('1', 179547)
('x', 115073)
('2', 106208)
('3', 78989)
('for', 77270)
('4', 69120)
('8', 66664)
('size', 65926)
('color', 64001)
('pack', 60019)
('and', 52647)
('black', 48431)
('to', 44800)
('5', 44732)
('in', 44415)
('of', 42642)
('in.', 41438)
('white', 40761)
('with', 37434)
('type', 37277)
('10', 35480)
('12', 34637)
('100', 31382)
('quantity', 31152)
('6', 29672)
('unit', 28454)
('11', 28070)
('length', 26176)
('paper', 25784)
('20', 24506)
('vwr', 23414)
('box', 23111)
('blue', 21241)
('ea', 20980)
('hp', 20972)
('cs', 19753)
('w', 19725)
('16', 19477)
('service', 18632)
('steel', 18151)
('order', 18092)
('50', 18019)
('7', 17557)
('pk', 17188)
('500', 17100)
('material', 17059)
('a', 16675)
('25', 16654)
('9', 16436)
('per', 16279)
('officemax', 16265)
('cartridge', 16099)
('request', 15855)
('toner', 15831)
('assorted', 15471)
('high', 14959)
('capacity', 14803)
('24', 14630)
('tube', 14493)
('sterile', 14328)
('clear', 14300)
('l', 14187)
('use', 13896)
('description', 13679)

## Bag of Words

### Words to Features

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# All product descriptions as a plain Python list, in row order.
# Series.tolist() avoids one label lookup per row and does not rely on the
# frame having a default 0..n-1 index or on the Python-2-only xrange.
pd_list = df.product_description.tolist()

In [15]:
# Word-level bag-of-words vectorizer; every option is spelled out explicitly.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
)

In [16]:
# Learn the vocabulary and build the document-term count matrix.
# fit_transform returns a sparse matrix; it is kept sparse on purpose because
# a dense 611110 x 165508 array has roughly 10**11 cells and will not fit in
# memory on ordinary hardware. `.shape` works on the sparse result. If a dense
# view is needed, convert a small slice only, e.g. word_features[:5].toarray().
word_features = vectorizer.fit_transform(pd_list)

In [17]:
word_features.shape

(611110, 165508)

In [18]:
# Vocabulary learned by the vectorizer, as a list of terms.
# Newer scikit-learn releases expose get_feature_names_out(); older ones only
# have get_feature_names(), so use whichever is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Show the vocabulary size and a short sample instead of printing every term.
print(len(vocab))
print(vocab[:50])



## LDA