In [1]:
import pandas as pd
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
import numpy as np
import nltk
import pickle
from lda_embedder.text_embedder import TextEmbedder
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression

## Effects of Sequential Topics

We believe that Yelp’s star rating system does not reflect time-sensitive information about business performance (whether or not the business is improving at a given time), which users generally care more about. In addition to establishing a metric that represents performance at a given time, we wanted to analyze why a business is undergoing a successful period and, if it is not, suggest how to improve it. For this reason, we could not simply take the average of the most recent N reviews; rather, we needed to investigate the content of the review text.

## Embeddings 

We propose 3 different embeddings, each of which is our own implementation of a modified LDA. We used LDA for the following reasons:  
- We can easily map texts to vectors, unlike word2vec, which maps words to vectors
- High interpretability
- Adjustable based on time or based on business. (Our embedding for the identical raw texts will look different if the business is different)

In [2]:
# Load the cleaned business table and the cleaned review table (which also carries the
# precomputed offset columns used later); both paths are relative to the working directory.
business = pd.read_csv('data/chinese_business_clean.csv')
reviews = pd.read_csv('data/chinese_reviews_clean_offsets.csv')

In [3]:
# load pretrained topic models
# `lda` is the saved gensim LDA model and `dictionary` the matching token dictionary.
# NOTE(review): the dictionary filename is spelled 'chinsese' — it must match the file on disk.
lda =  models.LdaModel.load('data/gensim/lda.model')
dictionary = corpora.Dictionary.load('data/gensim/chinsese_dict.dict')
# All reviews written by one hard-coded user id.
# NOTE(review): `user1` is not referenced again in the visible part of the notebook — confirm it is still needed.
user1 = reviews[reviews['user_id'] == 'CxDOIDnH8gp9KXzpBHJYXw']

In [4]:
# helper function
def tokenize(text):
    """Tokenize `text` with gensim's `simple_preprocess` and drop every token found in `STOPWORDS`.

    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

In [5]:
# The first review in the table is the running example for the embedding demos;
# show its text together with the id of the business it was written for.
sample = reviews['text'].values[0]
sample, reviews['business_id'].values[0]

('This place is horrible, we were so excited to try it since I got a gift card for my birthday. We went in an ordered are whole meal and they did not except are gift card, because their system was down. Unacceptable, this would have been so helpful if we would have known this prior!!',
 'jQsNFOzDpxPmOurSWCg1vQ')

### 0. embed (not using this)
Simply uses gensim built-in get_document_topics function 

In [6]:
def embed(text, model, dictionary):
    """Embed a raw text as a dense topic-probability vector.

    Parameters
    ----------
    text : str
        Raw text; it is tokenized with `tokenize` before the lookup.
    model
        Topic model exposing `num_topics` and `get_document_topics(bow)`
        (the loaded gensim LDA model in this notebook).
    dictionary
        Object exposing `doc2bow(tokens)` (the loaded gensim dictionary).

    Returns
    -------
    numpy.ndarray
        Float vector of length `model.num_topics`. Topics that
        `get_document_topics` does not report stay at 0.0.
    """
    bow = dictionary.doc2bow(tokenize(text))
    # A float buffer keeps the result dtype the same whether or not any topic is
    # reported (a list of int zeros became an int array when nothing was reported).
    topic_probs = np.zeros(model.num_topics)
    for topic_id, prob in model.get_document_topics(bow):
        topic_probs[topic_id] = prob
    return topic_probs

In [7]:
# Whole-text topic vector for the sample review.
embed(sample, lda, dictionary)

array([ 0.        ,  0.        ,  0.94487847,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

### 1. embed_sent (baseline)

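Tokenize the text into sentences, apply the Dirichlet (LDA topic) distribution to each sentence, and average the results.

$\theta_{d} = \frac{1}{n} \sum_{s \in d} \mathrm{Dirichlet}(\alpha_{s})$, where $n$ is the number of sentences in document $d$.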

In [8]:
def embed_sent(text, model, dictionary, sent_length = False):
    """Average the per-sentence topic vectors of `text`.

    The text is split with `nltk.sent_tokenize`, each sentence is embedded with
    `embed`, and the vectors are averaged over the number of sentences.

    Parameters
    ----------
    text : str
        Raw text to embed.
    model
        Topic model passed through to `embed`; its `num_topics` sets the vector length.
    dictionary
        Dictionary passed through to `embed`.
    sent_length : bool, default False
        When true, also return the number of sentences.

    Returns
    -------
    numpy.ndarray, or (numpy.ndarray, int) when `sent_length` is true.
        A text with no sentences yields the all-zero vector (and a count of 0).
    """
    # Tokenize once and keep the pieces under their own name instead of shadowing `text`.
    sentences = nltk.sent_tokenize(text)
    n_sentences = len(sentences)

    # Size the buffer from the model argument and embed with that same model,
    # rather than a fixed 128 entries and the global `lda`.
    total = np.zeros(model.num_topics)
    for sentence in sentences:
        total += embed(sentence, model, dictionary)

    # Empty text gives zero sentences; return the zero vector instead of dividing by 0.
    mean = total / n_sentences if n_sentences else total
    if sent_length:
        return mean, n_sentences
    return mean

In [9]:
# Sentence-averaged topic vector for the sample review.
embed_sent(sample, lda, dictionary)

array([ 0.        ,  0.        ,  0.59200754,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.0398718 ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

### 2. augmented embed  
The baseline embedding tends to leave most entries at 0, which caused overfitting in the later evaluation. We therefore scale and smooth the values with a parameter $\beta$ (the `alpha` argument in the code below).

$\theta_{d}(\beta) = \beta + \beta * \frac{1}{n} \frac{\sum_{s \in d} ^ {n} Dirichlet(\alpha _{s})}{max (Dirichlet (\alpha_{s'})  \text{for} s' \in d)}$

In [10]:
def augmented_embed_sent(text, alpha = 0.5):
    """Smoothed, normalized sentence-level topic embedding of `text`.

    Per-sentence topic vectors (from `embed`, using the module-level `lda` and
    `dictionary`) are summed, divided by their largest entry, shifted and scaled
    as `alpha + alpha * v`, and finally normalized to sum to 1.

    Parameters
    ----------
    text : str
        Raw text to embed.
    alpha : float, default 0.5
        Smoothing constant added to, and multiplied into, the rescaled vector.

    Returns
    -------
    numpy.ndarray
        Vector of length `lda.num_topics`. When no sentence yields any topic
        mass, the rescale is skipped and the result is uniform.
    """
    # Size the buffer from the model rather than a fixed 128 entries.
    total = np.zeros(lda.num_topics)
    for sentence in nltk.sent_tokenize(text):
        total += embed(sentence, lda, dictionary)

    peak = total.max()
    # With no topic mass at all, peak is 0 and total/peak would be 0/0 = NaN,
    # so leave the (all-zero) vector unscaled in that case.
    scaled = total / peak if peak > 0 else total

    smoothed = alpha + alpha * scaled
    return smoothed / smoothed.sum()

In [11]:
# Smoothed embedding of the sample review with the default alpha.
augmented_embed_sent(sample)

array([ 0.00774795,  0.00774795,  0.01549591,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00826203,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00774795,
        0.00774795,  0.00774795,  0.00774795,  0.00774795,  0.00

### 3. augmented embed business tfidf   
Finally, we scale the topics that are most characteristic of each business by using tf-idf.  

$\theta_{d}(\beta, b) = \text{augmented}(d) * \text{augmented}(d_{b}) * log \frac{N_{s_{b}}}{|s \in d_{b} : t \in s| + \gamma}$

In [12]:
# Load the precomputed business tf-idf table; it is indexed by business_id further down.
# NOTE: pickle.load can execute arbitrary code — only load this file from a trusted source.
with open('data/b_tfidf.pickle', 'rb') as f:
    business_tfidf_dict = pickle.load(f)

In [13]:
def augmented_tf_business_tfidf(text, business_id, alpha = 0.5, minimum_probability = 0.0):
    """Smoothed embedding of `text`, re-weighted by the business's tf-idf vector.

    The result of `augmented_embed_sent(text, alpha)` is multiplied element-wise
    by `business_tfidf_dict[business_id]` and normalized to sum to 1.

    Parameters
    ----------
    text : str
        Raw text to embed.
    business_id
        Key into the module-level `business_tfidf_dict`; an unknown key raises KeyError.
    alpha : float, default 0.5
        Forwarded to `augmented_embed_sent`.
    minimum_probability : float, default 0.0
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    numpy.ndarray
        Normalized weighted vector, or an all-zero vector of the same length
        (after printing a notice) when the weighted vector sums to 0.
    """
    tf = augmented_embed_sent(text, alpha)
    weighted = np.multiply(tf, business_tfidf_dict[business_id])
    total = weighted.sum()
    if total == 0.0:
        print ('Business has too low tfidf')
        # Match the embedding length instead of assuming 128 topics.
        return np.zeros_like(weighted, dtype=float)
    return weighted / total

In [14]:
# Business-weighted embedding of the sample review, using the id of the business it was written for.
augmented_tf_business_tfidf(sample, 'jQsNFOzDpxPmOurSWCg1vQ')

array([ 0.00823063,  0.00850354,  0.01664549,  0.00719256,  0.00975305,
        0.00760763,  0.00659569,  0.00822557,  0.00680431,  0.00859019,
        0.00837075,  0.00803532,  0.00643756,  0.0072259 ,  0.0071042 ,
        0.00712047,  0.00890572,  0.00725679,  0.00763082,  0.00725775,
        0.00715523,  0.00645387,  0.00853101,  0.00701878,  0.00760285,
        0.00852419,  0.00732912,  0.00807017,  0.00682027,  0.00682255,
        0.00848553,  0.00794957,  0.00673923,  0.00886794,  0.0069194 ,
        0.0091204 ,  0.00691605,  0.00751699,  0.00752805,  0.00734154,
        0.00678523,  0.00810036,  0.00721915,  0.00835811,  0.00693084,
        0.00806947,  0.00782072,  0.00602946,  0.00724106,  0.0074844 ,
        0.00704896,  0.00832074,  0.00604295,  0.01052008,  0.00698405,
        0.00875748,  0.00790735,  0.00800847,  0.00767964,  0.00811386,
        0.00813932,  0.0071059 ,  0.00662446,  0.00913259,  0.00764019,
        0.00773215,  0.00784225,  0.00719325,  0.00728189,  0.00

We implemented a wrapper class, `TextEmbedder`, that exposes all of these embeddings through a single interface. 

In [15]:
# Bundle the topic model, dictionary and business tf-idf table into the project's TextEmbedder wrapper.
embedder = TextEmbedder(model = lda, dictionary = dictionary, business_tfidf = business_tfidf_dict)

## Dataset

We wanted to analyze how well LDA can extract specific topics, rather than general topics. For this reason, we first filtered the businesses with the 'chinese' tag, which gave 3773 businesses. We then kept the corresponding reviews, removed meaningless ones (we found some reviews that contained only 1 word because of typos), and ended up using 175281 reviews. For the case studies, we also used the 2 businesses with the most reviews, pH0BLkL4cbxKzu471VZnuA and X8c23dur0ll2D9XTu-I8Qg. 

In [16]:
# use top2 growing, declining business for case study
# For each case-study business, an equally sized random set of review ids is drawn
# from the whole table as a baseline.
#
# random.sample() needs a sequence: sampling from a set is deprecated since Python 3.9
# and raises TypeError from 3.11, so the unique ids are materialized as a list first.
# A dedicated seeded generator keeps the draw reproducible without touching the global
# `random` state, and the id lists get their own names so the example text stored in
# `sample` earlier in the notebook is not overwritten.
all_review_ids = reviews['review_id'].unique().tolist()
sampler = random.Random(0)

case1 = reviews[reviews['business_id'] == 'pH0BLkL4cbxKzu471VZnuA'] # growing

sample1_ids = sampler.sample(all_review_ids, len(case1))
sample1 = reviews[reviews['review_id'].isin(sample1_ids)]

case2 = reviews[reviews['business_id'] == 'X8c23dur0ll2D9XTu-I8Qg'] # declining

sample2_ids = sampler.sample(all_review_ids, len(case2))
sample2 = reviews[reviews['review_id'].isin(sample2_ids)]

We also preprocessed the labels so that they reflect business-specific performance. For this reason, we took the difference between each review's rating and the rating of its business and defined it as the 'offset'. Similarly, we grouped the reviews by quarter (every 3 months) and defined quarter offsets. These values indicate how positive or negative each review or quarter is relative to the specific business. Finally, in order to build a classifier, we mapped the offsets to 3 classes (positive, neutral, negative) and to 2 classes (positive and negative).

In [17]:
# label data, try to predict simple labels -- positive(1), negative(-1) or average(0)
def _label_offset(offset):
    """Map one offset to -1 (below zero), 1 (above zero) or 0 (anything else, including NaN)."""
    if offset < 0.0:
        return -1
    return int(offset > 0.0)


# Element-wise version used throughout the notebook. Fixing `otypes` makes the output
# dtype explicit and lets the call succeed on empty input; without it np.vectorize has
# to evaluate the first element to infer the dtype and raises on size-0 arrays.
labels = np.vectorize(_label_offset, otypes=[int])

In [18]:
# Peek at the identifier, rating and offset columns for the first case-study business.
case1[['review_id', 'business_id', 'stars', 'business_offset', 'quarter_offset']].head()

Unnamed: 0,review_id,business_id,stars,business_offset,quarter_offset
48835,xehs7BV3CG_prSOVguvWRA,pH0BLkL4cbxKzu471VZnuA,5,1.0,1.125
48836,xDjjmA611W3wiYHBgiDd6g,pH0BLkL4cbxKzu471VZnuA,2,-2.0,-2.022472
48837,de4DtH_xNjYlbfqUY0Kj2A,pH0BLkL4cbxKzu471VZnuA,5,1.0,1.059701
48838,akys4VJyn7Ve4qhMK1zpgA,pH0BLkL4cbxKzu471VZnuA,4,0.0,-0.296296
48839,t8KdC_-TNI0xUwLsSUKrbw,pH0BLkL4cbxKzu471VZnuA,1,-3.0,-2.940299


## Experiments

We used a classification task as a measure of how representative each embedding is. To measure accuracy, we used stratified K-fold cross-validation and collected training and test accuracy for 3 different classifiers (SVM, xgboost, fully connected neural network). We also experimented with binary classification (positive vs. negative), and in this case we collected precision, recall, and F1 score. 

In [19]:
# Shared 5-fold stratified splitter used by run_experiments; no shuffle or random_state is passed.
skf = StratifiedKFold(n_splits=5)

In [20]:
def cmat_to_accuracy(mat):
    """Return the accuracy implied by a confusion matrix.

    Parameters
    ----------
    mat : array-like
        Confusion matrix; a NumPy array or a nested list of counts.

    Returns
    -------
    The sum of the diagonal divided by the sum of all entries.
    """
    # asarray lets nested lists through as well as arrays.
    mat = np.asarray(mat)
    return np.trace(mat) / mat.sum()

In [21]:
def run_experiments(embed, yb):
    """Cross-validate an XGBoost and an SVM classifier on one embedding and print their accuracies.

    For every fold of the module-level `skf` splitter, a fresh `XGBClassifier` and a
    fresh `SVC` are fitted on the training part; train and test accuracy are derived
    from the confusion matrix via `cmat_to_accuracy`. The mean and variance over the
    folds are printed for each model (SVM first, then XGBoost).

    Parameters
    ----------
    embed : array-like
        Feature matrix with one row per example. (The parameter name shadows the
        notebook's `embed` function inside this body only.)
    yb : array-like
        One class label per row of `embed`.

    Returns
    -------
    None
    """
    features = np.asarray(embed)

    # Recent XGBoost releases only accept class labels 0..k-1, while the labels used in
    # this notebook are -1/0/1. np.unique's inverse index is an order-preserving 0..k-1
    # encoding; accuracy is unchanged because predictions and reference labels share it.
    _, targets = np.unique(np.asarray(yb), return_inverse=True)
    targets = targets.ravel()

    accuracy = {name: {'train': [], 'test': []} for name in ('svm', 'xgboost')}
    for train_idx, test_idx in skf.split(features, targets):
        folds = {
            'train': (features[train_idx], targets[train_idx]),
            'test': (features[test_idx], targets[test_idx]),
        }
        train_x, train_y = folds['train']
        # Same fitting order as before: XGBoost first, then the SVM.
        for name, clf in (('xgboost', XGBClassifier()), ('svm', SVC())):
            clf.fit(train_x, train_y)
            for split, (split_x, split_y) in folds.items():
                # confusion_matrix takes (y_true, y_pred); the accuracy is the same
                # either way, but this keeps rows = true classes.
                cmat = confusion_matrix(split_y, clf.predict(split_x))
                accuracy[name][split].append(cmat_to_accuracy(cmat))

    for name in ('svm', 'xgboost'):
        train_scores = accuracy[name]['train']
        test_scores = accuracy[name]['test']
        print (name)
        print ('Train set\n avg: {}'.format(np.mean(train_scores)), 'var: {}'.format(np.var(train_scores)))
        print ('Test set\n avg: {}'.format(np.mean(test_scores)), 'var: {}'.format(np.var(test_scores)))

### 1. Predict business offset (3 labels)

In [22]:
# 3-class targets: `y` for the case-study business, `y2` for the random baseline sample.
y = labels(case1['business_offset'].values)
y2 = labels(sample1['business_offset'].values)

Run classifiers with random businesses

#### Embedding 1

In [23]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([embedder.embed_sent(t) for t in sample1['text'].values])
run_experiments(features, y2)

svm
Train set
 avg: 0.4907597676149698 var: 1.6195278891527505e-08
Test set
 avg: 0.49075997903524715 var: 2.5857951078284415e-07
xgboost
Train set
 avg: 0.7615508054434093 var: 3.469828938301724e-06
Test set
 avg: 0.6160137804982282 var: 0.0001558430957570951


#### Embedding 2

In [24]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([embedder.augmented_embed_text(t) for t in sample1['text'].values])
run_experiments(features, y2)

svm
Train set
 avg: 0.4907597676149698 var: 1.6195278891527505e-08
Test set
 avg: 0.49075997903524715 var: 2.5857951078284415e-07
xgboost
Train set
 avg: 0.7846512627621912 var: 3.779457095430929e-05
Test set
 avg: 0.6232051526493962 var: 0.00021488974937629448


#### Embedding 3

In [25]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([
    embedder.augmented_tf_business_tfidf(t, b)
    for t, b in zip(sample1['text'].values, sample1['business_id'].values)
])
run_experiments(features, y2)

svm
Train set
 avg: 0.4907597676149698 var: 1.6195278891527505e-08
Test set
 avg: 0.49075997903524715 var: 2.5857951078284415e-07
xgboost
Train set
 avg: 0.8633237913377462 var: 7.954979717625874e-05
Test set
 avg: 0.5878110866050299 var: 0.001295346212860533


Run classifiers with specific businesses

#### Embedding 1

In [26]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([embedder.embed_sent(t) for t in case1['text'].values])
run_experiments(features, y)

svm
Train set
 avg: 0.4517454988315065 var: 5.3695907774017384e-08
Test set
 avg: 0.4517472905101771 var: 8.657901575484515e-07
xgboost
Train set
 avg: 0.7737423060465421 var: 0.00012103679548114681
Test set
 avg: 0.5657229711868886 var: 0.00040361104595184986


#### Embedding 2

In [27]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([embedder.augmented_embed_text(t) for t in case1['text'].values])
run_experiments(features, y)

svm
Train set
 avg: 0.4517454988315065 var: 5.3695907774017384e-08
Test set
 avg: 0.4517472905101771 var: 8.657901575484515e-07
xgboost
Train set
 avg: 0.7931203712846845 var: 4.605165625937355e-05
Test set
 avg: 0.5590510177108114 var: 0.00026273726618568285


#### Embedding 3

In [28]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([
    embedder.augmented_tf_business_tfidf(t, b)
    for t, b in zip(case1['text'].values, case1['business_id'].values)
])
run_experiments(features, y)

svm
Train set
 avg: 0.4517454988315065 var: 5.3695907774017384e-08
Test set
 avg: 0.4517472905101771 var: 8.657901575484515e-07
xgboost
Train set
 avg: 0.790040650406504 var: 2.0497314965717702e-05
Test set
 avg: 0.5405762622257468 var: 0.0003982131618355532


### 2. Predict business offset (2 labels)

Run classifiers with random businesses

In [29]:
# Binary task: drop reviews whose business offset is exactly 0 (the neutral class).
case1b = case1[case1['business_offset'] != 0.0]

reviewsb = reviews[reviews['business_offset'] != 0.0]
# random.sample() needs a sequence: sampling from a set is deprecated since Python 3.9
# and raises TypeError from 3.11, so the unique ids are materialized as a list first.
# A dedicated seeded generator keeps the baseline draw reproducible, and the id list gets
# its own name so the module-level `sample` is left untouched.
# NOTE(review): the baseline size is len(case1), not len(case1b), so it may be larger
# than the binary case-study set — confirm this is intended.
sample1b_ids = random.Random(0).sample(reviewsb['review_id'].unique().tolist(), len(case1))
sample1b = reviewsb[reviewsb['review_id'].isin(sample1b_ids)]

In [30]:
# 2-class targets: `yb` for the case-study business, `y2b` for the random baseline sample.
yb = labels(case1b['business_offset'].values)
y2b = labels(sample1b['business_offset'].values)

#### Embedding 1

In [31]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([embedder.embed_sent(t) for t in sample1b['text'].values])
run_experiments(features, y2b)

svm
Train set
 avg: 0.5734086271257548 var: 1.5483592887082267e-08
Test set
 avg: 0.5734086688814859 var: 2.4683642457263684e-07
xgboost
Train set
 avg: 0.834447436576653 var: 3.5282840324723616e-05
Test set
 avg: 0.7289598087345608 var: 0.00011339071013990502


#### Embedding 2

In [32]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([embedder.augmented_embed_text(t) for t in sample1b['text'].values])
run_experiments(features, y2b)

svm
Train set
 avg: 0.5734086271257548 var: 1.5483592887082267e-08
Test set
 avg: 0.5734086688814859 var: 2.4683642457263684e-07
xgboost
Train set
 avg: 0.8560062210533875 var: 1.3571059885540267e-05
Test set
 avg: 0.7192029887416445 var: 4.3955312735830466e-05


#### Embedding 3

In [33]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([
    embedder.augmented_tf_business_tfidf(t, b)
    for t, b in zip(sample1b['text'].values, sample1b['business_id'].values)
])
run_experiments(features, y2b)

svm
Train set
 avg: 0.5734086271257548 var: 1.5483592887082267e-08
Test set
 avg: 0.5734086688814859 var: 2.4683642457263684e-07
xgboost
Train set
 avg: 0.8948921200946751 var: 9.954328574180222e-06
Test set
 avg: 0.6992107839344179 var: 0.00029403906315437003


Run classifiers with specific businesses

#### Embedding 1

In [34]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([embedder.embed_sent(t) for t in case1b['text'].values])
run_experiments(features, yb)

svm
Train set
 avg: 0.6312769821752896 var: 5.1213820734668746e-08
Test set
 avg: 0.6312782032438565 var: 8.238484328303165e-07
xgboost
Train set
 avg: 0.9011839207933523 var: 1.0371038432887464e-06
Test set
 avg: 0.7847889430391171 var: 5.775199855071701e-05


#### Embedding 2

In [35]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([embedder.augmented_embed_text(t) for t in case1b['text'].values])
run_experiments(features, yb)

svm
Train set
 avg: 0.6312769821752896 var: 5.1213820734668746e-08
Test set
 avg: 0.6312782032438565 var: 8.238484328303165e-07
xgboost
Train set
 avg: 0.9133789800215375 var: 1.720690797712318e-05
Test set
 avg: 0.7934091436528197 var: 8.178626590605422e-05


#### Embedding 3

In [36]:
# Stored as `features`, not `embed`: rebinding `embed` at module level would overwrite
# the embed() helper that embed_sent / augmented_embed_sent call.
features = np.array([
    embedder.augmented_tf_business_tfidf(t, b)
    for t, b in zip(case1b['text'].values, case1b['business_id'].values)
])
run_experiments(features, yb)

svm
Train set
 avg: 0.6312769821752896 var: 5.1213820734668746e-08
Test set
 avg: 0.6312782032438565 var: 8.238484328303165e-07
xgboost
Train set
 avg: 0.9166049472009258 var: 6.49514206750849e-05
Test set
 avg: 0.7890745468141616 var: 0.000572446360040989


### 3. Predict quarter offset (3 labels)

In [37]:
def embed_by_enum(data, embedder, enum = 0, binary = False):
    """Embed every review text in `data` with the embedding selected by `enum`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'quarter_offset' and 'text'; 'user_id' and/or 'business_id'
        are read as well for the embeddings that take them.
    embedder
        Object providing the embedding methods listed below.
    enum : int, default 0
        0 embed, 1 embed_sent, 2 augmented_embed_text, 3 user_tfidf_embed,
        4 user_tf_business_idf, 5 user_tfidf_business_idf,
        6 augmented_tf_business_tfidf.
    binary : bool, default False
        When true, rows whose 'quarter_offset' equals 0 are dropped first.

    Returns
    -------
    (numpy.ndarray, pandas.Series)
        The stacked embeddings and the (unmodified) 'quarter_offset' column.
        For an unsupported `enum` a message is printed and a bare None is
        returned instead of a tuple, so callers that unpack must pass a valid code.
    """
    rows = data[data['quarter_offset'] != 0] if binary else data
    label = rows['quarter_offset']

    def column(name):
        return rows[name].values

    # One lazily evaluated builder per supported code; only the chosen one touches the frame.
    builders = (
        (0, lambda: [embedder.embed(t) for t in column('text')]),
        (1, lambda: [embedder.embed_sent(t) for t in column('text')]),
        (2, lambda: [embedder.augmented_embed_text(t) for t in column('text')]),
        (3, lambda: [embedder.user_tfidf_embed(t, u)
                     for t, u in zip(column('text'), column('user_id'))]),
        (4, lambda: [embedder.user_tf_business_idf(t, b)
                     for t, b in zip(column('text'), column('business_id'))]),
        (5, lambda: [embedder.user_tfidf_business_idf(t, u, b)
                     for t, u, b in zip(column('text'), column('user_id'), column('business_id'))]),
        (6, lambda: [embedder.augmented_tf_business_tfidf(t, b)
                     for t, b in zip(column('text'), column('business_id'))]),
    )
    for code, build in builders:
        if enum == code:
            return np.array(build()), label

    print ('enum {} is not supported'.format(enum))
    return None

In [38]:
# helper to build sequential data, fitting a linear regression and taking a slope
# on a side note, I also tried with SVR but the line of best fit tends to be horizontal, aka simply finding the average
def build_data(df, enum):
    """Build one feature row and one target per quarter of `df`.

    For every distinct value of df['quarter'] (in sorted order) the quarter's
    reviews are embedded with `embed_by_enum` (using the module-level `embedder`),
    a `LinearRegression` of the embeddings against the review's position in the
    quarter (0, 1, 2, ...) is fitted, and the fitted coefficients become that
    quarter's feature vector. The target is the quarter's 'quarter_avg' value.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'quarter' and 'quarter_avg' plus the columns `embed_by_enum` reads.
    enum : int
        Embedding code forwarded to `embed_by_enum`; it must be a supported code,
        because the returned pair is unpacked here.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Features with one row per quarter, and the per-quarter targets.
    """
    X = []
    y = []
    for q in sorted(set(df['quarter'])):
        filtered = df[df['quarter'] == q]
        # Only the embeddings are needed; the second return value is discarded, which
        # also avoids shadowing the notebook-level `labels` and `embed` names here.
        quarter_embeds, _ = embed_by_enum(filtered, embedder, enum)

        # fit on the sequence, store the slope
        regr = LinearRegression()
        regr.fit(np.arange(len(quarter_embeds)).reshape(-1,1), quarter_embeds)
        X.append(regr.coef_.ravel())

        # Take the first row's value: indexing into a set picked an arbitrary element
        # whenever a quarter carried more than one distinct 'quarter_avg'.
        y.append(filtered['quarter_avg'].values[0])
    return np.array(X), np.array(y)

#### Embedding 1

In [39]:
# Per-quarter features/targets with embedding code 1 (embed_sent); the targets are then
# turned into offsets from the business's star rating (first row of 'business_stars').
# NOTE: this rebinds `y`, replacing the per-review labels assigned to `y` in section 1.
x,y = build_data(case1, 1)
y = y - case1['business_stars'].values[0]

In [40]:
# Collapse the quarter offsets into the -1 / 0 / 1 classes.
y = labels(y)

In [41]:
# Cross-validate both classifiers on the per-quarter slope features.
run_experiments(x,y)



svm
Train set
 avg: 0.5409523809523809 var: 0.00023219954648526052
Test set
 avg: 0.5460317460317461 var: 0.0025799949609473416
xgboost
Train set
 avg: 1.0 var: 0.0
Test set
 avg: 0.8063492063492064 var: 0.013948097757621566


#### Embedding 2

In [42]:
# Same as above with embedding code 2 (augmented_embed_text).
# NOTE: this rebinds `y2`, replacing the baseline-sample labels assigned to `y2` in section 1.
x2,y2 = build_data(case1, 2)
y2 = y2 - case1['business_stars'].values[0]

In [None]:
# Classify the quarter offsets into -1 / 0 / 1 and cross-validate on the embedding-2 features.
y2 = labels(y2)
run_experiments(x2,y2)



svm
Train set
 avg: 0.5409523809523809 var: 0.00023219954648526052
Test set
 avg: 0.5460317460317461 var: 0.0025799949609473416
xgboost
Train set
 avg: 1.0 var: 0.0
Test set
 avg: 0.4222222222222222 var: 0.04460569412950366


#### Embedding 3

In [None]:
# Same as above with embedding code 6 (augmented_tf_business_tfidf).
x3,y3 = build_data(case1, 6)
y3 = y3 - case1['business_stars'].values[0]

In [None]:
# Classify the quarter offsets into -1 / 0 / 1 and cross-validate on the embedding-3 features.
y3 = labels(y3)
run_experiments(x3,y3)