# Custom logistic classifier to add weights to words for sentiment analysis

In [7]:
import turicreate as tc
import numpy as np

In [42]:
products = tc.SFrame("amazon_baby_subset.sframe")

In [6]:
import json
with open('important_words.json', 'r') as f: # Reads the list of most frequent words
    important_words = json.load(f)
important_words = [str(s) for s in important_words]

In [8]:
import string 
def remove_punctuation(text):
    translator = text.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    return text

products['review_clean'] = products['review'].apply(remove_punctuation)

In [9]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

In [10]:
products['contains_perfect'] = products['perfect'].apply(lambda n : n != 0)
products['contains_perfect'].sum()

2955

In [11]:
def get_numpy_data(data_sframe, features, label):
    data_sframe['intercept'] = 1
    features = ['intercept'] + features
    features_sframe = data_sframe[features]
    feature_matrix = features_sframe.to_numpy()
    label_sarray = data_sframe[label]
    label_array = label_sarray.to_numpy()
    return(feature_matrix, label_array)

In [12]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')

In [13]:
feature_matrix.shape

(53072, 194)

In [20]:
'''
produces probablistic estimate for P(y_i = +1 | x_i, w).
estimate ranges between 0 and 1.
'''
def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    dot = np.dot(feature_matrix,coefficients)
    
    # Compute P(y_i = +1 | x_i, w) using the link function
    
    predictions = 1/(1 + np.exp(-dot))
    
    # return predictions
    return predictions

In [28]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors,feature)
    
    # Return the derivative
    return derivative

In [29]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    logexp = np.log(1. + np.exp(-scores))
    
    # Simple check to prevent overflow
    mask = np.isinf(logexp)
    logexp[mask] = -scores[mask]
    
    lp = np.sum((indicator-1)*scores - logexp)
    return lp

In [30]:
from math import sqrt

def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in range(max_iter):

        # Predict P(y_i = +1|x_i,w) using your predict_probability() function
        
        predictions = predict_probability(feature_matrix,coefficients)
        
        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)
        
        # Compute the errors as indicator - predictions
        errors = indicator - predictions
        
        for j in range(len(coefficients)): # loop over each coefficient
            
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j].
            # Compute the derivative for coefficients[j]. Save it in a variable called derivative
            
            derivative = feature_derivative(errors,feature_matrix[:,j])
            
            # add the step size times the derivative to the current coefficient
            coefficients[j] += step_size*derivative
        
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [31]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients=np.zeros(194),
                                   step_size=1e-7, max_iter=301)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

In [33]:
scores = np.dot(feature_matrix, coefficients)
p,n=0,0
for i in scores:
    if i>0:
        p+=1
    else:
        n+=1
print(p,n)

25126 27946


In [45]:
num_mistakes = len(products['sentiment']) - p
accuracy  = (len(products) - num_mistakes)/len(products)
accuracy

0.4734323183599638

In [48]:
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=False)

In [49]:
word_coefficient_tuples

[('use', -0.05386014844520313),
 ('bought', -0.0415110333921089),
 ('day', -0.03898203728648711),
 ('bottles', -0.03306951529475273),
 ('first', -0.030051249236035808),
 ('however', -0.028978976142317068),
 ('well', -0.028711552980192588),
 ('still', -0.02774269723066133),
 ('waste', -0.026592778462247283),
 ('money', -0.02448210054589172),
 ('cup', -0.02404274807115496),
 ('tried', -0.02139434854368248),
 ('go', -0.019846256660777207),
 ('monitor', -0.01870237142432583),
 ('clean', -0.018359460662945686),
 ('son', -0.018246193486087036),
 ('since', -0.017737543997218053),
 ('never', -0.017137867010854794),
 ('come', -0.016001798500102516),
 ('took', -0.015537698955653887),
 ('thing', -0.015040658977043393),
 ('hard', -0.014977044903587944),
 ('disappointed', -0.01486631944997698),
 ('actually', -0.014709833465080667),
 ('quality', -0.013868727265297587),
 ('item', -0.0137566767312614),
 ('recommend', -0.013213475301677043),
 ('thought', -0.012969046546319293),
 ('reviews', -0.01280883