<a href="https://colab.research.google.com/github/Brent-Morrison/uwml/blob/main/uwml_classification_w1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
np.set_printoptions(edgeitems=10,linewidth=180, precision=4)
import pandas as pd
from sklearn import linear_model
from google.colab import drive
import matplotlib.pyplot as plt
import string
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [2]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### 1. Read data

In [3]:
products = pd.read_csv('gdrive/My Drive/uwml/amazon_baby.csv')
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [4]:
# Check type of column "review"
print(products.dtypes)
print(type(products['review'][0]))

name      object
review    object
rating     int64
dtype: object
<class 'str'>


#### 2. Clean punctuation

In [5]:
trans_table = str.maketrans(dict.fromkeys(string.punctuation, ''))

def remove_punctuation(text):
    return text.translate(trans_table)

products['review_clean'] = products['review'].astype(str).apply(remove_punctuation)

products.head()

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


In [6]:
# Fill NA's
products = products.fillna({'review':''})

#### 3/4. Ignore neutral sentiment and assign positive / negative indicator

In [7]:
products = products[products['rating'] != 3].copy()
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)
products.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


#### 5. Load indices data & create train / test sets

In [8]:
with open('gdrive/My Drive/uwml/module-2-assignment-train-idx.json', 'r') as tr:
    train_idx = json.load(tr)

with open('gdrive/My Drive/uwml/module-2-assignment-test-idx.json', 'r') as te:
    test_idx = json.load(te)

In [9]:
train_data = products.iloc[train_idx].copy()
test_data = products.iloc[test_idx].copy()

#### 6. Build word count vector

In [10]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b') # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

#### 7. Fit logistic regression model

In [11]:
sentiment_model = LogisticRegression(max_iter=100).fit(X=train_matrix, y=products['sentiment'].iloc[train_idx])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
print('Number of features seen during fit: '.ljust(45)+': '+f'{sentiment_model.n_features_in_:,}')
coefficients = sentiment_model.coef_
print('Length of coefficients array: '.ljust(45)+': '+f'{coefficients.shape[1]:,}')

Number of features seen during fit:          : 121,713
Length of coefficients array:                : 121,713


#### 8. Number of positive coefficients

In [13]:
# Quiz 1
print('Positive coefficients', f'{sum(coefficients[0] > 0):,}')
print('Negative coefficients', f'{sum(coefficients[0] <= 0):,}')

Positive coefficients 90,250
Negative coefficients 31,463


#### 9. Predictions

In [14]:
sample_test_data = test_data[10:13]#.reset_index()
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [15]:
sample_test_data.iloc[0]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [16]:
sample_test_data.iloc[1]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

#### 10/11. Class prediction

In [17]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
scores_manual = np.matmul(coefficients, np.matrix(sample_test_matrix.toarray().T)) + sentiment_model.intercept_

print(scores)
print(scores_manual)

[  5.0382  -3.1078 -10.7575]
[[  5.0382  -3.1078 -10.7575]]


In [18]:
print(sentiment_model.predict(sample_test_matrix))
print(np.sign(scores_manual))

[ 1 -1 -1]
[[ 1. -1. -1.]]


#### 12. Pobability predictions

In [19]:
# Quiz 2
def sigmoid_from_score(x):
  return 1 / (1 + np.exp(x))

print(sigmoid_from_score(scores))
print(sigmoid_from_score(scores_manual))

[0.0064 0.9572 1.    ]
[[0.0064 0.9572 1.    ]]


#### 13/14. Find extreme reviews

In [20]:
test_matrix = vectorizer.transform(test_data['review_clean'])
test_set_probs = sentiment_model.predict_proba(test_matrix)

Which class will be returned first?

In [21]:
sentiment_model.classes_

array([-1,  1])

##### Top 20

In [22]:
test_data['sentiment_model_prob'] = test_set_probs[:, 1]#np.round(test_set_scores[:, 1], 6)

In [23]:
top_20 = test_data.sort_values('sentiment_model_prob', ascending=False).head(20)
top_20

Unnamed: 0,name,review,rating,review_clean,sentiment,sentiment_model_prob
52631,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent\'s Magazine and re...,5,After seeing this in Parents Magazine and read...,1,1.0
137034,Graco Pack \'n Play Element Playard - Flint,My husband and I assembled this Pack n\' Play ...,4,My husband and I assembled this Pack n Play la...,1,1.0
100166,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1,1.0
133651,"Britax 2012 B-Agile Stroller, Red",[I got this stroller for my daughter prior to ...,4,I got this stroller for my daughter prior to t...,1,1.0
119182,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1,1.0
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1,1.0
168697,Graco FastAction Fold Jogger Click Connect Str...,Graco\'s FastAction Jogging Stroller definitel...,5,Gracos FastAction Jogging Stroller definitely ...,1,1.0
87017,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1,1.0
114796,"Fisher-Price Cradle \'N Swing, My Little Snug...",My husband and I cannot state enough how much ...,5,My husband and I cannot state enough how much ...,1,1.0
97325,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product I work as a Cu...,1,1.0


In [24]:
# Quiz 3
top_20.name.value_counts()

Evenflo X Sport Plus Convenience Stroller - Christina                           1
Graco Pack \'n Play Element Playard - Flint                                     1
Stokke Scoot Stroller - Light Green                                             1
Simple Wishes Hands-Free Breastpump Bra, Pink, XS-L                             1
P\'Kolino Silly Soft Seating in Tias, Green                                     1
Baby Jogger City Mini GT Double Stroller, Shadow/Orange                         1
Buttons Cloth Diaper Cover - One Size - 8 Color Options                         1
Stork Craft Beatrice Combo Tower Chest, White                                   1
Britax Boulevard 70-G3 Convertible Car Seat Seat, Onyx                          1
Mamas &amp; Papas 2014 Urbo2 Stroller - Black                                   1
Evenflo 6 Pack Classic Glass Bottle, 4-Ounce                                    1
Freemie Hands-Free Concealable Breast Pump Collection System                    1
Fisher-Price Cra

##### Bottom 20

In [25]:
bottom_20 = test_data.sort_values('sentiment_model_prob', ascending=True).head(20)
bottom_20

Unnamed: 0,name,review,rating,review_clean,sentiment,sentiment_model_prob
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,2.320084e-15
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,7.424319e-15
120209,Levana Safe N\'See Digital Video Baby Monitor ...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,8.490513e-13
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,1.971202e-12
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,1.84367e-10
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,2.267341e-10
95420,One Step Ahead Hide-Away Extra Long Bed Rail,"I bought a brand new 56"" hide-away bed safety ...",1,I bought a brand new 56 hideaway bed safety ra...,-1,2.290569e-10
59546,Ellaroo Mei Tai Baby Carrier - Hershey,This is basically an overpriced piece of fabri...,1,This is basically an overpriced piece of fabri...,-1,2.953786e-10
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,3.803718e-10
176046,Baby Trend Inertia Infant Car Seat - Horizon,"I really wanted to love this seat; however, I ...",1,I really wanted to love this seat however I wo...,-1,4.718512e-10


In [26]:
# Quiz 4
bottom_20.name.value_counts()

The First Years True Choice P400 Premium Digital Monitor, 2 Parent Unit                                    1
Fisher-Price Ocean Wonders Aquarium Bouncer                                                                1
Keekaroo Height Right High Chair, Infant Insert and Tray Combo, Natural/Cherry                             1
Philips AVENT Newborn Starter Set                                                                          1
Snuza Portable Baby Movement Monitor                                                                       1
Safety 1st Exchangeable Tip 3 in 1 Thermometer                                                             1
VTech Communications Safe &amp; Sound Digital Audio Monitor with two Parent Units                          1
Safety 1st Deluxe 4-in-1 Bath Station                                                                      1
Peg-Perego Tatamia High Chair, White Latte                                                                 1
Baby Jogger Summit 

#### 15. Accuracy

In [27]:
test_set_class = sentiment_model.predict(test_matrix)
test_data['sentiment_model_class'] = test_set_class
test_data

Unnamed: 0,name,review,rating,review_clean,sentiment,sentiment_model_prob,sentiment_model_class
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1,0.923160,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,1.000000,1
16,Nature\'s Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1,0.945891,1
20,Nature\'s Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1,0.999927,1
28,"Lamaze Peekaboo, I Love You","One of baby\'s first and favorite books, and i...",4,One of babys first and favorite books and it i...,1,0.978233,1
...,...,...,...,...,...,...,...
183507,Maxboost iPhone 5S/5 Case - Protective Snap-on...,got this for my wife and she loves it would de...,5,got this for my wife and she loves it would de...,1,0.976363,1
183515,Maxboost iPhone 5S/5 Case - Protective Snap-on...,I love this phone case! My iPhone is always ...,5,I love this phone case My iPhone is always s...,1,0.997089,1
183522,Airline Seat Belt Extender - The Best Extensio...,I bought this as a father\'s day gift for my d...,5,I bought this as a fathers day gift for my dad...,1,0.999979,1
183524,Squeasy Snacker 6oz Silicone Reusable Food Pou...,"I love this product, it makes my life easier. ...",5,I love this product it makes my life easier Wi...,1,0.999984,1


In [28]:
# Quiz 5
accuracy = sum(test_data['sentiment'] == test_data['sentiment_model_class'])/ len(test_data['sentiment'])
print('Accuracy on test set: ', f'{accuracy:.3f}')

Accuracy on test set:  0.932


#### 16. Classifier with fewer words

In [29]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [30]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

#### 17. Logistic regression model (subset)

In [31]:
simple_model = LogisticRegression().fit(X=train_matrix_word_subset, y=products['sentiment'].iloc[train_idx])

#### 18. Inspect model weights

In [32]:
simple_coefs = simple_model.coef_

# Demonstrate equivalence
print(simple_coefs.flatten())
print(simple_coefs[0])

[ 1.3637  0.944   1.1922  0.0854  0.5202  1.5103  1.6733  0.5038  0.1909  0.0588 -1.6521 -0.2093 -0.5115 -2.0345 -2.3485 -0.6213 -0.3205 -0.8981 -0.3622 -2.1098]
[ 1.3637  0.944   1.1922  0.0854  0.5202  1.5103  1.6733  0.5038  0.1909  0.0588 -1.6521 -0.2093 -0.5115 -2.0345 -2.3485 -0.6213 -0.3205 -0.8981 -0.3622 -2.1098]


In [33]:
simple_model_coef_table = pd.DataFrame({
    'word':significant_words,
    'coefficient':simple_model.coef_.flatten()
    }).sort_values('coefficient', ascending=False)

simple_model_coef_table

Unnamed: 0,word,coefficient
6,loves,1.673269
5,perfect,1.510263
0,love,1.363697
2,easy,1.192219
1,great,0.94395
4,little,0.520174
7,well,0.50376
8,able,0.190937
3,old,0.085424
9,car,0.058813


In [34]:
model_coef_table = pd.DataFrame({
    'word':vectorizer.get_feature_names_out(),
    'coefficient':coefficients.flatten()
    }).sort_values('coefficient', ascending=False)

model_coef_table

Unnamed: 0,word,coefficient
40372,excellent,2.412110
13287,awesome,2.091270
27348,complaints,2.078515
80942,pleased,2.068785
10118,amazing,2.049382
...,...,...
114477,useless,-2.583730
81610,poorly,-2.676401
27609,concept,-2.808718
34484,disappointing,-3.137688


In [35]:
all_coefs = pd.merge(simple_model_coef_table, model_coef_table, how='left', on='word', suffixes=("_simple", "_sentiment"))
all_coefs

Unnamed: 0,word,coefficient_simple,coefficient_sentiment
0,loves,1.673269,1.699774
1,perfect,1.510263,1.933978
2,love,1.363697,1.558259
3,easy,1.192219,1.344149
4,great,0.94395,1.255622
5,little,0.520174,0.558768
6,well,0.50376,0.478963
7,able,0.190937,0.3408
8,old,0.085424,0.00036
9,car,0.058813,0.123058


#### 19. Compare model accuracy (training data)

In [36]:
# Quiz 9
# compute the classification accuracy of the sentiment_model on the train_data
train_data['sentiment_model_class'] = sentiment_model.predict(train_matrix)

accuracy = sum(train_data['sentiment'] == train_data['sentiment_model_class'])/ len(train_data['sentiment'])
print('Sentiment model accuracy on train set: ', f'{accuracy:.3f}')

Sentiment model accuracy on train set:  0.948


In [37]:
# Quiz 9
# compute the classification accuracy of the simple_model on the train_data
train_data['simple_model_class'] = simple_model.predict(train_matrix_word_subset)

accuracy = sum(train_data['sentiment'] == train_data['simple_model_class'])/ len(train_data['sentiment'])
print('Simple model accuracy on train set: ', f'{accuracy:.3f}')

Simple model accuracy on train set:  0.867


#### 20. Compare model accuracy (test data)

In [38]:
# Quiz 10
# compute the classification accuracy of the sentiment_model on the test_data
accuracy = sum(test_data['sentiment'] == test_data['sentiment_model_class'])/ len(test_data['sentiment'])
print('Sentiment model accuracy on test set: ', f'{accuracy:.3f}')

Sentiment model accuracy on test set:  0.932


In [39]:
# Quiz 10
# compute the classification accuracy of the simple_model on the test_data
test_data['simple_model_class'] = simple_model.predict(test_matrix_word_subset)

accuracy = sum(test_data['sentiment'] == test_data['simple_model_class'])/ len(test_data['sentiment'])
print('Simple model accuracy on test set: ', f'{accuracy:.3f}')

Simple model accuracy on test set:  0.869


#### 21. Majority class prediction (baseline)

In [40]:
products.groupby('sentiment').size()

sentiment
-1     26493
 1    140259
dtype: int64

In [41]:
# Quiz 11
# compute the classification accuracy of the majority class prediction on the test_data
test_data['majority_class'] = 1
accuracy = sum(test_data['sentiment'] == test_data['majority_class']) / len(test_data['sentiment'])
print('Majority class classifier accuracy on test set: ', f'{accuracy:.3f}')

Majority class classifier accuracy on test set:  0.843
