# Yelp Data Mining Project
## CMPE - 255 Data Mining Fall 2017
### Group 6
- Dhrumil Shah
- Nishant Rathi
- Rashmi Sharma

## Notebook to Preprocess Data and create CSR

In this Jupyter Notebook, we load the training and test data sets that we created from the Yelp data set.
The data is cleaned and converted to CSR matrices. Because these CSR matrices are huge, they are stored as pickle files for further processing.

In [8]:
import re
import numpy as np
import scipy as sp
from collections import Counter
from scipy.sparse import csr_matrix
import cPickle as pickle
from collections import defaultdict

## This block has functions for:

- clean - remove punctuation and short words (1-2 characters), and convert all words to lowercase
- kmers - form k-mers by grouping 2 and 3 consecutive words
- csr - build the CSR matrix, scale it by IDF, and L2-normalize its rows
- pickle - write an object to a pickle file and read it back from a pickle file

In [2]:
def clean(raw):
    """Normalize a raw review string before it is split into words.

    The steps run in this order:
      1. every ``<...>`` tag-like span is replaced by a space;
      2. every character outside ``a-z``, ``A-Z`` and ``0-9`` is replaced by
         a space;
      3. words of one or two characters are deleted, together with any
         non-word characters directly in front of them;
      4. the result is lowercased.

    Whitespace is not collapsed or stripped; callers are expected to
    ``split()`` the result.
    """
    without_tags = re.sub('<.*?>', ' ', raw)
    alnum_only = re.sub('[^a-zA-Z0-9]', ' ', without_tags)
    long_words_only = re.sub(r'\W*\b\w{1,2}\b', '', alnum_only)
    return long_words_only.lower()

def group(inp, n=2):
    """Yield every run of ``n`` consecutive items of ``inp`` (sliding window).

    ``inp`` must support ``len()`` and slicing (e.g. a list of words).
    Windows are yielded left to right as slices ``inp[i:i+n]``. If ``inp``
    has fewer than ``n`` items, nothing is yielded.
    """
    # ``range`` rather than the Python 2-only ``xrange`` so the generator also
    # runs on Python 3; a negative or zero argument simply yields no windows.
    for i in range(len(inp) - (n - 1)):
        yield inp[i:i + n]

def group2words(inp):
    """Return every pair of adjacent words in ``inp`` as ``"w1 w2"`` strings.

    The pairs come from ``group(inp, 2)`` and keep their left-to-right order.
    """
    return [first + " " + second for first, second in group(inp, 2)]

def group3words(inp):
    """Return every run of three adjacent words in ``inp`` as ``"w1 w2 w3"``.

    The triples come from ``group(inp, 3)`` and keep their left-to-right order.
    """
    return [
        first + " " + second + " " + third
        for first, second, third in group(inp, 3)
    ]

def getKmers(inp):
    """Return the 2-word k-mers of ``inp`` followed by its 3-word k-mers.

    The result is a new list; ``inp`` itself is not modified.
    """
    pairs = group2words(inp)
    triples = group3words(inp)
    return pairs + triples

def build_matrix(docs):
    r""" Build a sparse term-count matrix from a list of documents,
    each of which is a list of words/terms in the document.

    Column ids are assigned to terms in order of first appearance across
    ``docs``. Entry ``(i, c)`` of the result is the number of times the term
    with column id ``c`` occurs in ``docs[i]``.

    Returns a ``scipy.sparse.csr_matrix`` of dtype double with shape
    ``(len(docs), number_of_distinct_terms)`` and sorted column indices.
    """
    nrows = len(docs)
    idx = {}  # term -> column id
    nnz = 0   # total number of stored entries (distinct terms per document)
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = len(idx)
    ncols = len(idx)

    # set up memory
    # The builtin ``int`` is used as dtype: the ``np.int`` alias was removed
    # from NumPy (1.24) and meant exactly the builtin ``int``.
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=int)
    n = 0  # non-zero counter: next free slot in ind/val

    # transfer values
    for i, d in enumerate(docs):
        cnt = Counter(d)
        # The order inside a row does not matter here because
        # sort_indices() below puts every row into column order.
        for j, (term, freq) in enumerate(cnt.items()):
            ind[n + j] = idx[term]
            val[n + j] = freq
        n += len(cnt)
        ptr[i + 1] = n

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()

    return mat


# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    r""" Scale a CSR matrix by inverse document frequency (idf).

    The document frequency of a column is the number of stored entries in
    that column; its idf factor is ``log(nrows / document_frequency)``.
    Every stored value is multiplied by the factor of its column.

    If copy is True, a scaled copy of the matrix is returned and ``mat`` is
    left untouched. If copy is False, ``mat`` is scaled in place and the
    scaling factors are returned as a dict ``{column id: idf factor}`` that
    holds only columns with at least one stored entry.

    ``mat.data`` is expected to be a floating-point array (as produced by
    ``build_matrix``), because the scaling is done in place.
    Extra keyword arguments are accepted and ignored.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    ind, val = mat.indices, mat.data

    # document frequency: stored entries per column, counted in one pass
    counts = np.bincount(ind)
    present = np.flatnonzero(counts)

    # inverse document frequency; columns without entries keep factor 0 and
    # are never looked up because they do not occur in ``ind``
    idf = np.zeros(counts.shape[0], dtype=np.double)
    idf[present] = np.log(nrows / counts[present].astype(np.double))

    # scale by idf (in place, so ``mat.data`` itself is updated)
    val *= idf[ind]

    if copy is False:
        df = defaultdict(int)
        df.update(zip(present.tolist(), idf[present].tolist()))
        return df
    return mat

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm.

    If copy is True, returns a normalized copy and leaves ``mat`` untouched.
    Otherwise ``mat`` is normalized in place and nothing (None) is returned.

    Rows whose squared values sum to zero (empty or all-zero rows) are left
    unchanged. ``mat.data`` is expected to be a floating-point array because
    the scaling is done in place. Extra keyword arguments are accepted and
    ignored.
    """
    if copy is True:
        mat = mat.copy()
    val, ptr = mat.data, mat.indptr

    # normalize one row at a time; each slice is a view into ``val``, so the
    # in-place multiply below updates the matrix data directly
    for i in range(mat.shape[0]):
        row = val[ptr[i]:ptr[i + 1]]
        rsum = np.dot(row, row)  # squared L-2 norm of the row
        if rsum == 0.0:
            continue  # do not normalize empty rows
        row *= 1.0 / np.sqrt(rsum)

    if copy is True:
        return mat

def save_pickle(matrix, filename):
    """Serialize ``matrix`` to the file ``filename`` with the highest pickle protocol.

    The file is opened in binary write mode, so an existing file is
    overwritten. Any picklable object can be passed, not only matrices.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(matrix, sink, pickle.HIGHEST_PROTOCOL)

def load_pickle(filename):
    """Read and return the object pickled in the file ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load pickle files
    that come from a trusted source (e.g. ones written by ``save_pickle``).
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

## Loading Training Data

In [11]:
# Read the whole training file into memory, one list entry per line
# (each entry keeps its trailing newline).
with open("data/train_final.dat", "r") as fh:
    lines = list(fh)

In [12]:
# Parse the training lines into three parallel lists: user ids, ratings and
# token lists (the cleaned words followed by their 2- and 3-word k-mers).
userid = []
rating = []
docs = []
i = 0  # number of lines processed; the 1-based line number inside the loop
j = 0  # number of lines that raised an exception
error_line_num = []
error_lines = []
for i, line in enumerate(lines, 1):
    try:
        # Fields are tab separated: user id, rating, review text.
        # NOTE(review): maxsplit=3 allows a fourth field, so any text after a
        # further tab is ignored - confirm against the data file format.
        l = line.split('\t', 3)
        d = clean(l[2]).split()
        kmers = getKmers(d)
        d.extend(kmers)
    except Exception as e:
        # Record the bad line and keep going (best effort).
        j = j + 1
        error_line_num.append(i)
        error_lines.append(line)
        print(e)
    else:
        # Append only after the whole line parsed, so a malformed line cannot
        # leave userid/rating one entry longer than docs.
        userid.append(l[0])
        rating.append(l[1])
        docs.append(d)

print('Training Data: Number of lines processed: ' + str(i))
print('Training Data: Length of userid array: ' + str(len(userid)))
print('Training Data: Length of rating array: ' + str(len(rating)))
print('Training Data: Length of docs array: ' + str(len(docs)))
print('Training Data: Number of exceptions encountered: ' + str(j))


Training Data: Number of lines processed: 78790
Training Data: Length of userid array: 78790
Training Data: Length of rating array: 78790
Training Data: Length of docs array: 78790
Training Data: Number of exceptions encountered: 0


## Loading Test Data

In [3]:
# Read the whole test file into memory, one list entry per line
# (each entry keeps its trailing newline).
with open("data/test_final.dat", "r") as fh:
    test_lines = list(fh)

In [4]:
# Parse the test lines into three parallel lists: user ids, ratings and
# token lists (the cleaned words followed by their 2- and 3-word k-mers).
test_userid = []
test_rating = []
test_docs = []
test_i = 0  # number of lines processed; the 1-based line number inside the loop
test_j = 0  # number of lines that raised an exception
test_error_line_num = []
test_error_lines = []
for test_i, line in enumerate(test_lines, 1):
    try:
        # Fields are tab separated: user id, rating, review text.
        # NOTE(review): maxsplit=3 allows a fourth field, so any text after a
        # further tab is ignored - confirm against the data file format.
        l = line.split('\t', 3)
        d = clean(l[2]).split()
        kmers = getKmers(d)
        d.extend(kmers)
    except Exception as e:
        # Record the bad line and keep going (best effort).
        test_j = test_j + 1
        test_error_line_num.append(test_i)
        test_error_lines.append(line)
        print(e)
    else:
        # Append only after the whole line parsed, so a malformed line cannot
        # leave test_userid/test_rating one entry longer than test_docs.
        test_userid.append(l[0])
        test_rating.append(l[1])
        test_docs.append(d)

print('Testing Data: Number of lines processed: ' + str(test_i))
print('Testing Data: Length of userid array: ' + str(len(test_userid)))
print('Testing Data: Length of rating array: ' + str(len(test_rating)))
print('Testing Data: Length of docs array: ' + str(len(test_docs)))
print('Testing Data: Number of exceptions encountered: ' + str(test_j))

Testing Data: Number of lines processed: 21210
Testing Data: Length of userid array: 21210
Testing Data: Length of rating array: 21210
Testing Data: Length of docs array: 21210
Testing Data: Number of exceptions encountered: 0


## Create CSR Matrix for Training data

In [13]:
# For Training data
csr_mat = build_matrix(docs)
print 'Training CSR Formed'
mat1 = csr_idf(csr_mat, copy=True)
print 'Training IDF Formed'
docs_csr = csr_l2normalize(mat1, copy=True)
print 'Training L2Norm Formed'

Training CSR Formed
Training IDF Formed
Training L2Norm Formed


## Save Training Data into pickle

In [14]:
# Persist the training user ids, ratings and normalized matrix.
# `filename` holds the directory prefix; each pickle's file name is appended.
# The raw token lists (`docs`) are not pickled.
filename = 'pickle/training/'
for payload, pickle_name in (
    (userid, 'userid.pickle'),
    (rating, 'rating.pickle'),
    (docs_csr, 'docs_csr.pickle'),
):
    save_pickle(payload, filename + pickle_name)
print('Training Data saved into pickle')

Training Data saved into pickle


## Create CSR Matrix for Testing data

In [9]:
# For Testing data
csr_mat2 = build_matrix(test_docs)
print 'Testing CSR Formed'
mat2 = csr_idf(csr_mat2, copy=True)
print 'Testing IDF Formed'
test_docs_csr = csr_l2normalize(mat2, copy=True)
print 'Testing L2Norm Formed'

Testing CSR Formed
Testing IDF Formed
Testing L2Norm Formed


## Save Testing Data into pickle

In [10]:
# Persist the test user ids, ratings and normalized matrix.
# `filename` holds the directory prefix; each pickle's file name is appended.
# The raw token lists (`test_docs`) are not pickled.
filename = 'pickle/testing/'
for payload, pickle_name in (
    (test_userid, 'userid.pickle'),
    (test_rating, 'rating.pickle'),
    (test_docs_csr, 'docs_csr.pickle'),
):
    save_pickle(payload, filename + pickle_name)
print('Testing Data saved into pickle')

Testing Data saved into pickle


## Create CSR Matrix for Merged data

In [15]:
# Create Merge Data
merged_userid = userid + test_userid
merged_rating = rating + test_rating
merged_docs = docs + test_docs

# CSR For Merged data
csr_mat3 = build_matrix(merged_docs)
print 'Merged CSR Formed'
mat3 = csr_idf(csr_mat3, copy=True)
print 'Merged IDF Formed'
merged_docs_csr = csr_l2normalize(mat3, copy=True)
print 'Merged L2Norm Formed'

Merged CSR Formed
Merged IDF Formed
Merged L2Norm Formed


## Save Merged Data into pickle

In [16]:
# Persist the merged user ids, ratings and normalized matrix.
# `filename` holds the directory prefix; each pickle's file name is appended.
# The raw token lists (`merged_docs`) are not pickled.
filename = 'pickle/merged/'
for payload, pickle_name in (
    (merged_userid, 'userid.pickle'),
    (merged_rating, 'rating.pickle'),
    (merged_docs_csr, 'docs_csr.pickle'),
):
    save_pickle(payload, filename + pickle_name)
print('Merged Data saved into pickle')

Merged Data saved into pickle
