In [None]:
import numpy as np
np.set_printoptions(edgeitems=10,linewidth=180, precision=4)
import pandas as pd
from sklearn import linear_model
from google.colab import drive
import matplotlib.pyplot as plt
import string
import json
from math import sqrt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [None]:
# Mount Google Drive at /content/gdrive (Colab only)
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### 1. Read data

In [None]:
# Load the review subset from the mounted Drive.
# The path is absolute (under the '/content/gdrive' mount point) so the read does
# not depend on the kernel's current working directory.
products = pd.read_csv('/content/gdrive/My Drive/uwml/amazon_baby_subset.csv')
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [None]:
# Class balance: number of reviews per sentiment label
products['sentiment'].value_counts()

 1    26579
-1    26493
Name: sentiment, dtype: int64

#### 2. Clean punctuation

In [None]:
# Load the list of important words.
# The path is absolute (under the '/content/gdrive' mount point) so the read does
# not depend on the working directory, and the file is decoded explicitly as
# UTF-8 instead of with the platform's default encoding.
with open('/content/gdrive/My Drive/uwml/important_words.json', 'r', encoding='utf-8') as words:
    important_words = json.load(words)

important_words[:14]

['baby',
 'one',
 'great',
 'love',
 'use',
 'would',
 'like',
 'easy',
 'little',
 'seat',
 'old',
 'well',
 'get',
 'also']

#### 3/4. Apply text cleaning

In [None]:
# Replace missing values in the 'review' column with the empty string
products = products.fillna({'review':''})

# Translation table that deletes every character listed in string.punctuation
trans_table = str.maketrans('', '', string.punctuation)

def remove_punctuation(text):
    """Return `text` with all string.punctuation characters stripped out."""
    return text.translate(trans_table)

# Punctuation-free copy of every review (values are cast to str first)
products['review_clean'] = products['review'].astype(str).map(remove_punctuation)

products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...


#### 5/6. Word counts

In [None]:
# Count how many times each important word occurs in every cleaned review.
# Each review is tokenised once (rather than once per word), and all count columns
# are built first and attached with a single concat: inserting the columns one at
# a time fragments the frame, which pandas flags with a PerformanceWarning.
review_tokens = products['review_clean'].str.split()
word_counts = pd.DataFrame(
    {word: review_tokens.apply(lambda tokens, w=word: tokens.count(w)) for word in important_words},
    index=products.index,
)

# Drop any earlier copies of the count columns so re-running this cell is idempotent.
products = pd.concat(
    [products.drop(columns=important_words, errors='ignore'), word_counts],
    axis=1,
)

products.head()

  products[word] = products['review_clean'].apply(lambda s : s.split().count(word))


Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53067,"Samsung Baby Care Washer, Stainless Platinum, ...","My infant goes to a really crappy daycare, and...",1,-1,My infant goes to a really crappy daycare and ...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53068,"Mud Pie Milestone Stickers, Boy",Pretty please open and inspect these stickers ...,1,-1,Pretty please open and inspect these stickers ...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53069,Best BIB for Baby - Soft Bib (Pink-Elephant),Great 5-Star Product but An Obvious knock-off ...,1,-1,Great 5Star Product but An Obvious knockoff of...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53070,Bouncy&reg; Inflatable Real Feel Hopping Cow,When I received the item my initial thought wa...,2,-1,When I received the item my initial thought wa...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 7. Count the number of times the word "perfect" appears

In [None]:
# Quiz 1
# Total occurrences of "perfect" vs. number of reviews with at least one occurrence
perfect_counts = products['perfect']
print('Sum of the word perfect', perfect_counts.sum())
print('Count of the word perfect', (perfect_counts >= 1).sum())

Sum of the word perfect 3207
Count of the word perfect 2955


In [None]:
# Flag reviews whose raw text contains the substring 'perfect' (case-sensitive)
products['contains_perfect'] = products['review'].apply(lambda text: int('perfect' in text))
print('Count of the word perfect', len(products.loc[products['contains_perfect'] >= 1, 'review_clean']))

Count of the word perfect 4246


In [None]:
# Same substring check on a lower-cased copy of the raw review text
products['review_lower'] = products['review'].map(lambda text: text.lower())
products['contains_perfect_lower'] = products['review_lower'].map(lambda text: int('perfect' in text))
print('Count of the word perfect (lower case)', len(products.loc[products['contains_perfect_lower'] >= 1, 'review_clean']))

Count of the word perfect (lower case) 4628


In [None]:
# Boolean flag derived from the 'perfect' count column
products['contains_perfect0'] = products['perfect'].ge(1)
sum(products['contains_perfect0'])

2955

#### 8/9. Convert data to arrays

In [None]:
def get_numpy_data(dataframe, features, label):
    """Build the feature matrix and label vector as NumPy arrays.

    Args:
        dataframe: pandas DataFrame holding the feature and label columns.
        features: sequence of column names to use as features.
        label: name of the column holding the labels.

    Returns:
        Tuple ``(feature_matrix, labels)``. The first column of
        ``feature_matrix`` is the constant 1 (intercept term), followed by the
        columns named in ``features`` in the given order.
    """
    feature_names = list(features)
    columns = ['constant'] + feature_names
    # Add the intercept on a copy of the selected columns: the caller's frame is
    # left untouched instead of silently gaining a 'constant' column.
    feature_frame = dataframe[feature_names].assign(constant=1)[columns]
    label_series = dataframe[label]
    return feature_frame.values, label_series.values

# Feature matrix (constant column + one count column per important word) and label vector
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')

In [None]:
# Quiz 2
# Number of columns in the feature matrix
feature_matrix.shape[1]

194

#### 10. Estimate conditional probability

In [None]:
# Dummy data for features
# Seed NumPy's global RNG so this toy example is reproducible across kernel restarts.
np.random.seed(42)
fm = np.random.rand(10,3)
print(fm.shape)
fm

(10, 3)


array([[0.3226, 0.2663, 0.1765],
       [0.8807, 0.6489, 0.3069],
       [0.713 , 0.8677, 0.961 ],
       [0.4817, 0.6499, 0.1117],
       [0.5779, 0.1482, 0.6758],
       [0.3799, 0.7948, 0.0709],
       [0.3998, 0.4653, 0.2838],
       [0.2557, 0.7974, 0.9266],
       [0.9115, 0.9294, 0.9835],
       [0.9551, 0.244 , 0.4228]])

In [None]:
# Dummy data for coefficients: a fixed (3, 1) column vector
coef = np.array([-1, 0.5, 1]).reshape(3, 1)
# Show the shape, the column vector and its transpose, separated by blank lines
for shown in (coef.shape, "\n", coef, "\n", coef.T):
    print(shown)

(3, 1)


[[-1. ]
 [ 0.5]
 [ 1. ]]


[[-1.   0.5  1. ]]


In [None]:
# Check the dot-product calculation: features times the coefficient column
print(fm.dot(coef))

[[-0.0129]
 [-0.2493]
 [ 0.6819]
 [-0.045 ]
 [ 0.172 ]
 [ 0.0884]
 [ 0.1167]
 [ 1.0695]
 [ 0.5367]
 [-0.4104]]


In [None]:
# Dummy data for y / sentiment: noisy linear score, thresholded at its median
noisy_score = np.dot(fm, coef).squeeze(-1) + np.random.rand(10,) / 10
sent = (noisy_score > np.median(noisy_score)) * 1
sent

array([0, 0, 1, 0, 1, 0, 1, 1, 1, 0])

In [None]:
def predict_probability(feature_matrix, coefficients):
    """Return P(y_i = +1 | x_i, w) for every row of `feature_matrix`.

    The linear score (dot product of features and coefficients) is mapped
    through the logistic link function.
    """
    linear_score = np.dot(feature_matrix, coefficients)
    return 1 / (1 + np.exp(-linear_score))

In [None]:
# Sanity check of predict_probability on the dummy features and coefficients
predict_probability(fm, coef)

array([[0.4968],
       [0.438 ],
       [0.6642],
       [0.4888],
       [0.5429],
       [0.5221],
       [0.5291],
       [0.7445],
       [0.631 ],
       [0.3988]])

#### 11. Compute derivative of log-likelihood

In [None]:
def feature_derivative(errors, feature):
    """Return the dot product of the errors with a feature column.

    This is the derivative of the log likelihood with respect to the
    coefficient that multiplies `feature`.
    """
    return np.dot(errors, feature)

#### 12. Compute log-likelihood

In [None]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    Computes sum_i [ (1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)) ]
    where score_i is the dot product of row i of `feature_matrix` with
    `coefficients`.

    Args:
        feature_matrix: 2-D array with one row per example.
        sentiment: array of labels; entries equal to +1 count as positive.
        coefficients: one coefficient per feature column, either 1-D or a
            single-column 2-D array.

    Returns:
        The log likelihood as a scalar.
    """
    # Flatten to 1-D: a (D, 1) coefficient column would otherwise give (N, 1)
    # scores that broadcast against the (N,) indicator into an (N, N) matrix,
    # silently summing N*N terms instead of N.
    indicator = (np.ravel(sentiment) == +1)
    scores = np.ravel(np.dot(feature_matrix, coefficients))
    # logaddexp(0, -s) equals log(1 + exp(-s)) but does not overflow when s is
    # very negative.
    lp = np.sum((indicator - 1) * scores - np.logaddexp(0, -scores))
    return lp

In [None]:
# Pass the coefficients as a 1-D vector: with the (3, 1) column `coef`, the (10, 1)
# scores broadcast against the (10,) labels and 100 terms get summed instead of 10.
compute_log_likelihood(fm, sent, coef.ravel())

-71.96040365581601

#### 13 / 14. Gradient ascent

In [None]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by gradient ascent on the log likelihood.

    Args:
        feature_matrix: 2-D array with one row per example and one column per feature.
        sentiment: array of labels; entries equal to +1 count as positive.
        initial_coefficients: starting value for each coefficient (not modified).
        step_size: multiplier applied to the gradient in every update.
        max_iter: number of gradient-ascent iterations to run.

    Returns:
        1-D float array holding the coefficients after `max_iter` iterations.
    """
    # Work on a flat float copy: with an integer input the fractional updates
    # would be truncated on assignment and the coefficients would never move.
    coefficients = np.array(initial_coefficients, dtype=float).ravel()

    # Indicator for (y_i = +1); the labels do not change between iterations.
    indicator = (sentiment == +1)

    for itr in range(max_iter):
        # Predict P(y_i = +1 | x_i, w) with the current coefficients
        predictions = predict_probability(feature_matrix, coefficients)

        # Errors are indicator - predictions
        errors = indicator - predictions

        # The dot product of the errors with the whole feature matrix gives the
        # derivative for every coefficient at once (one entry per column).
        gradient = feature_derivative(errors, feature_matrix)

        # Step uphill along the gradient
        coefficients += step_size * gradient

        # Report progressively less often as the iteration count grows
        should_report = (
            itr <= 15
            or (itr <= 100 and itr % 10 == 0)
            or (itr <= 1000 and itr % 100 == 0)
            or (itr <= 10000 and itr % 1000 == 0)
            or itr % 10000 == 0
        )
        if should_report:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [None]:
# Start from all-zero coefficients, sized from the feature matrix rather than a hard-coded 194
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients=np.zeros(feature_matrix.shape[1]), step_size=1e-7, max_iter=301)

iteration   0: log likelihood of observed labels = -36780.91745462
iteration   1: log likelihood of observed labels = -36775.13388766
iteration   2: log likelihood of observed labels = -36769.35644771
iteration   3: log likelihood of observed labels = -36763.58511684
iteration   4: log likelihood of observed labels = -36757.81987727
iteration   5: log likelihood of observed labels = -36752.06071132
iteration   6: log likelihood of observed labels = -36746.30760149
iteration   7: log likelihood of observed labels = -36740.56053039
iteration   8: log likelihood of observed labels = -36734.81948075
iteration   9: log likelihood of observed labels = -36729.08443548
iteration  10: log likelihood of observed labels = -36723.35537756
iteration  11: log likelihood of observed labels = -36717.63229015
iteration  12: log likelihood of observed labels = -36711.91515650
iteration  13: log likelihood of observed labels = -36706.20396002
iteration  14: log likelihood of observed labels = -36700.4986

In [None]:
# Number of fitted coefficients
len(coefficients)

194

#### 15. Predicting sentiment

In [None]:
# Raw score (features dotted with the fitted coefficients) for every review
preds_score = feature_matrix.dot(coefficients)
preds_score

array([ 0.0511, -0.0294,  0.0241,  0.0079,  0.1318,  0.1298,  0.008 ,  0.0398,  0.0073, -0.0523, ..., -0.0781,  0.0052, -0.0649, -0.0183, -0.0272, -0.0105, -0.0056, -0.4099,
        0.0141, -0.0676])

In [None]:
# Reviews with a positive score vs. a negative score
print('Positive review:', np.count_nonzero(preds_score > 0))
print('Negative review:', np.count_nonzero(preds_score < 0))

Positive review: 25128
Negative review: 27944


#### 16. Accuracy

In [None]:
# Quiz question 4
# Fraction of reviews whose thresholded probability agrees with the true label
preds_prob = predict_probability(feature_matrix, coefficients)
actual_positive = np.where(sentiment > 0, 1, 0)
predicted_positive = np.where(preds_prob > 0.5, 1, 0)
accuracy = (actual_positive == predicted_positive).sum() / len(sentiment)
print('Accuracy: ', f'{accuracy:.3f}')

Accuracy:  0.752


#### 17. Which words contribute most to positive & negative sentiments

In [None]:
# Pair each word with its coefficient (index 0 is skipped as the intercept),
# then order the pairs from largest to smallest coefficient
coefficients1 = list(coefficients[1:])
word_coefficient_tuples = list(zip(important_words, coefficients1))
word_coefficient_tuples.sort(key=lambda pair: pair[1], reverse=True)

#### 18. Positive words

In [None]:
# First ten entries of the coefficient-sorted word list
word_coefficient_tuples[:10]

[('great', 0.0665465684076429),
 ('love', 0.06589105285011145),
 ('easy', 0.06479500748256956),
 ('little', 0.04543615120028909),
 ('loves', 0.04497650326344663),
 ('well', 0.03013544683789827),
 ('perfect', 0.029740056726924896),
 ('old', 0.02007800844559498),
 ('nice', 0.018408957844935254),
 ('daughter', 0.01770350474519861)]

#### 19. Negative words

In [None]:
# Last ten entries of the coefficient-sorted word list
word_coefficient_tuples[-10:]

[('monitor', -0.024481826145288252),
 ('return', -0.026592670248870545),
 ('back', -0.027756667966667054),
 ('get', -0.028725181505074398),
 ('disappointed', -0.02897887627267753),
 ('even', -0.03005075362517889),
 ('work', -0.033069206353915095),
 ('money', -0.03898184228437664),
 ('product', -0.04151058927576694),
 ('would', -0.05385907076924775)]