In [None]:
import urllib
import re
import sqlite3

In [None]:
#functions
def initDB(db='nordstrom.db'):
    """Create the ``nordstromW`` results table if it does not exist yet.

    The table has four untyped columns: Name, VendorName, Color, Price.
    Calling this more than once (for example when re-running the notebook
    cell) is harmless because the table is only created when missing.

    Args:
        db: Path of the SQLite database file. Defaults to 'nordstrom.db'.
    """
    conn = sqlite3.connect(db)
    try:
        conn.execute(
            'CREATE TABLE IF NOT EXISTS nordstromW (Name,VendorName,Color,Price)'
        )
        conn.commit()
    finally:
        # Always release the database file, even if the statement fails.
        conn.close()

def replaceChars(text):
    """Normalise scraped product text into plain, space-separated words.

    The substitutions are applied in order:

    1. UTF-8 byte pairs for e-acute and e-circumflex become ``e``.
    2. The ``&#174;`` and ``&#233;`` entities, apostrophes and the UTF-8
       registered-trademark bytes are removed.
    3. ``&amp;`` or a bare ``&`` becomes ``and``.
    4. ``/`` and ``-`` become a space.
    5. A space followed by a parenthesised group is removed (greedy: from
       the first `` (`` to the last ``)``).
    6. Any run of two or more spaces collapses to a single space.

    Args:
        text: The raw string extracted from the page.

    Returns:
        The cleaned string.
    """
    substitutions = (
        ('\xc3\xa9', 'e'),
        ('\xc3\xaa', 'e'),
        # NOTE(review): "&#233;" (e-acute) is dropped here rather than
        # mapped to "e" like its UTF-8 form above -- confirm that is intended.
        ("(&#174;)|(&#233;)|'|\xc2\xae", ''),
        ('(&amp;)|&', 'and'),
        (r'[/-]', ' '),
        (r' \(.*\)', ''),
        # Collapse whole runs: replacing only pairs would leave "a  b"
        # behind for inputs such as "a - b" after step 4.
        (r' {2,}', ' '),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
# Create the nordstromW table in nordstrom.db before any rows are inserted.
initDB()

In [None]:
#gets information for item URLs on all of women's pages
class wPages:
    """Collect product-page URLs from Nordstrom's women's clothing listing.

    Constructing an instance performs network requests: it downloads the
    first listing page, works out how many result pages exist, builds the
    URL of every result page and then scrapes item links from the first
    ``pageLimit`` of them.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    Attributes:
        WomenProductURL: URL of the first listing page.
        pageLimit: Maximum number of result pages to scrape (None = all).
        wProductsHTML: HTML of the first listing page with tabs and line
            breaks removed.
        MaxPages: Highest page number found in the pagination markup.
        pageURLs: One listing URL per page, 1..MaxPages.
        itemAA: List of lists; one list of absolute item URLs per scraped
            page.
    """

    _listingURL = "http://shop.nordstrom.com/c/all-womens-clothing?origin=leftnav"
    _siteRoot   = 'http://shop.nordstrom.com'

    def __init__(self, pageLimit=3):
        """Fetch the listing and scrape item URLs.

        Args:
            pageLimit: How many result pages to scrape. The default of 3
                keeps the short test run; pass None to scrape every page.
        """
        self.WomenProductURL = self._listingURL
        self.pageLimit       = pageLimit
        self.wProductsHTML   = self.collectCompressHTML(self.WomenProductURL)
        self.MaxPages        = self.extractMaxPageNumber()
        self.pageURLs        = self.pageURLext()
        self.itemAA          = []
        self.extractItems()

    def collectCompressHTML(self, URL):
        """Download ``URL`` and return its body with tabs/newlines removed.

        Flattening the document onto one line lets the ``.*`` patterns used
        by the extractors span what were originally several lines.
        """
        response = urllib.urlopen(URL)
        try:
            text = response.read()
        finally:
            # Release the socket whether or not the read succeeded.
            response.close()
        return re.sub(r"[\t\n\r]", "", text.strip())

    def extractMaxPageNumber(self):
        """Return the largest ``data-page`` number in the pagination block.

        Falls back to 1 when the pagination markup (or any page number in
        it) is absent, so a single-page listing does not raise.
        """
        pagination = re.findall(
            r'<ul class="product-results-pagination truncated-pagination">.*</nav>',
            self.wProductsHTML,
        )
        if not pagination:
            return 1
        pageNumbers = [
            int(number)
            for number in re.findall(r'data-page="(\d{1,3})"', pagination[0])
        ]
        return max(pageNumbers) if pageNumbers else 1

    def pageURLext(self):
        """Return the listing URL for every page from 1 to ``MaxPages``."""
        # The base URL already carries a query string, so further parameters
        # are joined with '&' (a second '?' would be folded into "origin").
        return [
            self._listingURL + '&sort=Feature&page=' + str(num)
            for num in range(1, self.MaxPages + 1)
        ]

    ####get items and save URLs
    def extractItems(self):
        """Scrape item URLs from the first ``pageLimit`` result pages.

        Appends one list of absolute item URLs to ``itemAA`` per page
        visited. Slicing keeps this safe when fewer pages exist than the
        limit asks for.
        """
        if self.pageLimit is None:
            pagesToVisit = self.pageURLs
        else:
            pagesToVisit = self.pageURLs[:self.pageLimit]

        for pageURL in pagesToVisit:
            pageHTML = self.collectCompressHTML(pageURL)
            self.itemAA.append(self._parseItemURLs(pageHTML))

    def _parseItemURLs(self, pageHTML):
        """Return absolute item URLs found in one listing page's HTML.

        Only links between the FashionResults begin/end comments are
        considered; a page without that section yields an empty list.
        """
        resultSections = re.findall(
            '<!-- Begin FashionResults -->(.*)<!-- End FashionResults -->',
            pageHTML,
        )
        if not resultSections:
            return []
        extractedURLs = re.findall(
            r'href="(/s/[-a-z]*/\d{6,8}.origin=category)"',
            resultSections[0],
        )
        return self.itemURLCreate(extractedURLs)

    def itemURLCreate(self, itemExtensions):
        """Prefix each site-relative item path with the shop's host."""
        return [self._siteRoot + extension for extension in itemExtensions]
    
            



In [None]:
class itemMD:
    """Scrape one product page and hold the metadata extracted from it.

    Constructing an instance downloads ``URL`` (via the Python 2
    ``urllib.urlopen``) and runs every extractor once.

    Attributes:
        URL: The product page address.
        HTML: Page body with tabs and line breaks removed.
        vendorName: Cleaned brand name, or '' when not found.
        itemName: Cleaned product name, or '' when not found.
        itemID: List of item-number strings found, or [''] when none.
        itemPrice: Regular price as a string such as '1,234.00', or the
            integer 0 when no price is found.
        itemColor: Distinct colour names in page order, or [''] when none.
    """

    def __init__(self, URL):
        self.URL        = URL
        self.HTML       = self.collectCompressHTML()
        self.vendorName = self.vendorExtract()
        self.itemName   = self.nameExtract()
        self.itemID     = self.itemNumberExtract()
        self.itemPrice  = self.priceExtract()
        self.itemColor  = self.colorExtracter()
        # No database connection is opened here: dbPopulate() connects to
        # the file it is given, so a per-item connection would only leak.

    def collectCompressHTML(self):
        """Download ``self.URL`` and return it with tabs/newlines removed."""
        response = urllib.urlopen(self.URL)
        try:
            text = response.read()
        finally:
            # Release the socket whether or not the read succeeded.
            response.close()
        return re.sub(r"[\t\n\r]", "", text.strip())

    #extracts vendor name from item page
    def vendorExtract(self):
        """Return the cleaned brand name, or '' if the markup is missing."""
        vendorName = re.findall(
            '<section id="brand-title".+><h2><a.+>(.*)</a></h2></section>',
            self.HTML,
        )
        if not vendorName:
            # Same "empty value" fallback the other extractors use.
            return ''
        return replaceChars(vendorName[0])

    #extracts names from item page
    def nameExtract(self):
        """Return the cleaned product name, or '' if the markup is missing."""
        itemName = re.findall('<h1 itemprop="name">(.*)</h1></section', self.HTML)
        if not itemName:
            return ''
        return replaceChars(itemName[0])

    #extract regular price from item page
    def priceExtract(self):
        """Return the first regular price as a string, or 0 if none found."""
        itemPrice = re.findall(r'regularPrice":"\$([\d,]{1,5}\.\d{2})"', self.HTML)
        return itemPrice[0] if itemPrice else 0

    #extracts item number from item page
    def itemNumberExtract(self):
        """Return the list of item numbers found, or [''] when there are none."""
        itemID = re.findall(
            r'<div class="item-number-wrapper">Item #(\d+)</div>',
            self.HTML,
        )
        return itemID or ['']

    #extracts color from item page
    def colorExtracter(self):
        """Return distinct colour names in first-seen order, or [''] if none."""
        colorList = []
        for color in re.findall(r'value="color-\d{6,8}">([\w ]+)</option>', self.HTML):
            if color not in colorList:
                colorList.append(color)
        return colorList or ['']

    #exports item properties as a list
    def export(self):
        """Return [itemName, vendorName, itemID, itemPrice, itemColor]."""
        return [self.itemName, self.vendorName, self.itemID, self.itemPrice, self.itemColor]

    def dbPopulate(self, db):
        """Insert one ``nordstromW`` row per colour into the database ``db``.

        Args:
            db: Path of the SQLite database file; the ``nordstromW`` table
                must already exist.
        """
        records = [
            (self.itemName, self.vendorName, color, self.itemPrice)
            for color in self.itemColor
        ]
        conn = sqlite3.connect(db)
        try:
            conn.executemany("INSERT INTO nordstromW VALUES (?,?,?,?)", records)
            conn.commit()
        finally:
            # Close the connection even when an insert fails.
            conn.close()
            

In [None]:
# Build the crawler. The wPages constructor downloads the listing pages and
# fills nordstromWomen.itemAA with one list of item URLs per scraped page.
nordstromWomen = wPages()

In [None]:
# Visit every collected item URL: itemAA is a list of per-page URL lists.
for array in nordstromWomen.itemAA:
    for item in array:
        # Constructing itemMD downloads and parses the product page.
        testIT = itemMD(item)
        # Write one row per colour into the nordstromW table of nordstrom.db.
        testIT.dbPopulate('nordstrom.db')