Product Workflow:
1. Split each flavor review into 5 star reviews and 1 or 2 star reviews
2. Build top 10 review adjectives for each flavor and score (220 total adjectives across 480 flavors)
3. Convert to sparse matrix
4. Use sparse matrix to create cosine similarity for high scoring reviews and low scoring reviews
5. Keep top 5 most relevent product recommendations (will likely include the opposite scored flavor)


Review table can be used to look for favored or disliked flavors and get the top recommendations

In [312]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk, random, re
import spacy
from collections import Counter
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/bill/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [252]:
nlp = spacy.load('en_core_web_sm')

In [3]:
reviews = pd.read_csv('reviews.csv')

In [4]:
flavors = pd.read_csv('products.csv')

In [178]:
reviews.head()

Unnamed: 0,brand,key,author,date,stars,title,helpful_yes,helpful_no,text,taste,ingredients,texture,likes
0,bj,0_bj,Ilovebennjerry,2017-04-15,3,Not enough brownies!,10.0,3.0,"Super good, don't get me wrong. But I came for...",,,,
1,bj,0_bj,Sweettooth909,2020-01-05,5,I’m OBSESSED with this pint!,3.0,0.0,I decided to try it out although I’m not a hug...,,,,
2,bj,0_bj,LaTanga71,2018-04-26,3,My favorite...More Caramel Please,5.0,2.0,My caramel core begins to disappear about half...,,,,
3,bj,0_bj,chicago220,2018-01-14,5,Obsessed!!!,24.0,1.0,Why are people complaining about the blonde br...,,,,
4,bj,0_bj,Kassidyk,2020-07-24,1,Worst Ice Cream Ever!,1.0,5.0,This ice cream is worst ice cream I’ve ever ta...,,,,


In [179]:
flavors.head()

Unnamed: 0,brand,key,name,subhead,description,rating,rating_count,ingredients
0,bj,0_bj,Salted Caramel Core,Sweet Cream Ice Cream with Blonde Brownies & a...,Find your way to the ultimate ice cream experi...,3.7,208,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
1,bj,1_bj,Netflix & Chilll'd™,Peanut Butter Ice Cream with Sweet & Salty Pre...,There’s something for everyone to watch on Net...,4.0,127,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
2,bj,2_bj,Chip Happens,A Cold Mess of Chocolate Ice Cream with Fudge ...,Sometimes “chip” happens and everything’s a me...,4.7,130,"CREAM, LIQUID SUGAR (SUGAR, WATER), SKIM MILK,..."
3,bj,3_bj,Cannoli,Mascarpone Ice Cream with Fudge-Covered Pastry...,As a Limited Batch that captured the rapture o...,3.6,70,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
4,bj,4_bj,Gimme S’more!™,Toasted Marshmallow Ice Cream with Chocolate C...,It’s a gimme: there’s always room for s’more. ...,4.5,281,"CREAM, SKIM MILK, WATER, LIQUID SUGAR (SUGAR, ..."


In [253]:
reviews['spacy_doc'] = list(nlp.pipe(reviews.text))

In [536]:
def make_list(counter_a):
    return " ".join([a for a,_ in counter_a.most_common(15)])

In [538]:
flavor_adj = {}

for flavor in reviews.key:
    high_ratings = []
    low_ratings = []
    for doc in reviews[(reviews.key == flavor) & (reviews.stars == 5)].spacy_doc:
        for token in doc:
            if token.pos_ == 'ADJ':
                high_ratings.append(token.text.lower())
    flavor_name = flavors[flavors.key==flavor].name.values[0]
    flavor_adj[flavor_name+' high'] = make_list(Counter(high_ratings))
    for doc in reviews[(reviews.key == flavor) & (reviews.stars.isin((1,2)))].spacy_doc:
        for token in doc:
            if token.pos_ == 'ADJ':
                low_ratings.append(token.text.lower())
    flavor_adj[flavor_name+' low'] = make_list(Counter(low_ratings))

In [539]:
flavor_adj

{'Salted Caramel Core high': 'sweet blonde favorite salty perfect best more good delicious amazing first great much different bad',
 'Salted Caramel Core low': 'salty disappointed other excited first last great blonde good worst bad half delicious much few',
 "Netflix & Chilll'd™ high": 'salty sweet perfect best amazing favorite good great more delicious other soft first enough little',
 "Netflix & Chilll'd™ low": 'salty sweet pretzel worst only good disappointed better mushy disgusting harder more creamy pure soft',
 'Chip Happens high': 'salty sweet perfect permanent best good crunchy favorite amazing delicious limited great new more sad',
 'Chip Happens low': 'good salty excited late crunchy many sweet horrible better disappointed special ample terrible dark first',
 'Cannoli high': 'delicious new good sweet favorite perfect best glad first big crunchy other different awesome classic',
 'Cannoli low': 'excited good favorite better sweet great strange little wrong horrible big strong

general comments on ice cream

In [428]:
high_ratings = []
low_ratings = []
for doc in reviews[(reviews.stars == 5)].spacy_doc:
    for token in doc:
        if token.pos_ == 'ADJ':
            high_ratings.append(token.text.lower())
for doc in reviews[(reviews.stars.isin((1,2)))].spacy_doc:
    for token in doc:
        if token.pos_ == 'ADJ':
            low_ratings.append(token.text.lower())
            

In [564]:
adj_cv_1 = pd.DataFrame(pd.DataFrame(flavor_adj, index=list(flavor_adj)).iloc[0])
adj_cv_1.columns = ['text']
adj_cv_1

Unnamed: 0,text
Salted Caramel Core high,sweet blonde favorite salty perfect best more ...
Salted Caramel Core low,salty disappointed other excited first last gr...
Netflix & Chilll'd™ high,salty sweet perfect best amazing favorite good...
Netflix & Chilll'd™ low,salty sweet pretzel worst only good disappoint...
Chip Happens high,salty sweet perfect permanent best good crunch...
...,...
Layered Dessert S'mores low,good excited odd marshmallow disappointed disa...
Layered Dessert Peach Cobbler high,peach best good little natural real amazing de...
Layered Dessert Peach Cobbler low,peach first artificial much regular other only...
Layered Dessert Brownie Cheesecake high,delicious favorite wonderful wild flavorful ri...


In [566]:
adj_cv = CountVectorizer()
flavor_adj_cv = adj_cv.fit_transform(adj_cv_1.text)
adj_cv_df = pd.DataFrame(flavor_adj_cv.todense(), index=list(flavor_adj), columns=adj_cv.get_feature_names_out())
adj_cv_df

Unnamed: 0,2nd,5he,able,aboid,about,absent,absolute,acrid,actual,addicted,...,wild,wonderful,worried,worst,worth,wrong,yea,yucky,yummiest,yummy
Salted Caramel Core high,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Salted Caramel Core low,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Netflix & Chilll'd™ high,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Netflix & Chilll'd™ low,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Chip Happens high,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Layered Dessert S'mores low,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Layered Dessert Peach Cobbler high,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Layered Dessert Peach Cobbler low,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Layered Dessert Brownie Cheesecake high,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [567]:
cosine_matrix = cosine_similarity(adj_cv_df)

In [390]:
def map_flavor_name(x):
    return list(flavor_adj)[x]

convert products to top five similarily reviewed products by cosine similarity through adjectives used for 5 star reviews vs 1-3 star reviews

In [568]:
reco_flavor_id = pd.DataFrame(cosine_matrix.argsort()[:,-6:-1], index=flavor_adj, columns=['five','four','three','two','one'])

reco_flavor_id.one = reco_flavor_id.one.apply(map_flavor_name)
reco_flavor_id.two = reco_flavor_id.two.apply(map_flavor_name)
reco_flavor_id.three = reco_flavor_id.three.apply(map_flavor_name)
reco_flavor_id.four = reco_flavor_id.four.apply(map_flavor_name)
reco_flavor_id.five = reco_flavor_id.five.apply(map_flavor_name)

reco_flavor_id

Unnamed: 0,five,four,three,two,one
Salted Caramel Core high,VANILLA FUDGE COOKIE high,The Tonight Dough® high,Netflix & Chilll'd™ high,Everything But The...® high,SALTED CARAMEL TRUFFLE high
Salted Caramel Core low,Mint Chocolate Cookie low,New York Style Cheesecake high,Cookies & Cream Cheesecake Core low,CARAMEL COOKIE CRUNCH GELATO low,Salted Caramel Core high
Netflix & Chilll'd™ high,Oat of This Swirled™ high,ROMAN RASPBERRY SORBETTO high,Salted Caramel Core high,SALTED CARAMEL TRUFFLE high,CARAMEL COOKIE CRUNCH GELATO high
Netflix & Chilll'd™ low,OREO® high,OREO® low,Chubby Hubby® high,Green Tea Ice Cream high,Sweet Like Sugar Cookie Dough Core low
Chip Happens high,Oat of This Swirled™ high,Coffee Vanilla Chocolate TRIO CRISPY LAYERS high,S'mores high,Boots on the Moooo’n™ high,Gimme S’more!™ high
...,...,...,...,...,...
Layered Dessert S'mores low,Chocolate Chip Cookie Dough Core low,Rum Tres Leches Ice Cream low,The Tonight Dough® low,Gimme S’more!™ low,Chillin' the Roast™ low
Layered Dessert Peach Cobbler high,Natural Strawberry high,Vanilla Chocolate high,SEA SALT CARAMEL GELATO high,CHOCOLATE PEANUT BUTTER CUP GELATO high,Natural Vanilla high
Layered Dessert Peach Cobbler low,French Vanilla low,ORGANIC BROWN BUTTER CARAMEL low,New York Style Cheesecake low,Brownie Batter Core high,Peanut Butter Chip HEAVEN Light Ice Cream low
Layered Dessert Brownie Cheesecake high,Chocolate Fudge Brownie high,Peanut Butter Chocolate Fudge Non-Dairy high,Coconut Caramel Dark Chocolate Non-Dairy Bar high,COLD BREW COFFEE SORBETTO high,Vanilla Ice Cream high


In [569]:
reco_flavor_id.loc['Chocolate high']

five      Butter Pecan Ice Cream high
four             Natural Vanilla high
three                      OREO® high
two      Chocolate Peanut Butter high
one             Homemade Vanilla high
Name: Chocolate high, dtype: object

In [582]:
flavor_adj['Peanut Butter World® high']

'best favorite other perfect good first more new only whole rich amazing salty wonderful special'

In [584]:
flavor_adj['Karamel Sutra® Core high']

'best favorite other good only free amazing perfect new able first more decadent great rich'

In [585]:
flavor_adj['Coconut Caramel Chocolate TRIO CRISPY LAYERS high']

'favorite perfect only more wonderful new first best salty whole few amazing delicious belgian awesome'

In [586]:
flavor_adj['Chocolate Peanut Butter Ice Cream high']

'creamy delicious smooth favorite perfect best amazing more rich good other new hard sweet whole'

In [587]:
flavor_adj['S\'mores high']

'favorite best hard delicious other more new good perfect great sad amazing rich only sweet'

In [572]:
reco_flavor_id.loc['Milk & Cookies high']

five                     Chocolate Shake It™ high
four     Banana Peanut Butter Chip Ice Cream high
three                 SALTED CARAMEL TRUFFLE high
two                           Gimme S’more!™ high
one       Sweet Like Sugar Cookie Dough Core high
Name: Milk & Cookies high, dtype: object

In [573]:
reco_flavor_id.loc['Peanut Butter World® high']

five                                   Milk & Cookies high
four                                          S'mores high
three               Chocolate Peanut Butter Ice Cream high
two                               Karamel Sutra® Core high
one      Coconut Caramel Chocolate TRIO CRISPY LAYERS high
Name: Peanut Butter World® high, dtype: object

In [574]:
reco_flavor_id.loc['OREO® high']

five     Rum Raisin Ice Cream high
four          Natural Vanilla high
three        Coffee Ice Cream high
two          Homemade Vanilla high
one                 Chocolate high
Name: OREO® high, dtype: object

In [575]:
flavor_adj['Peanut Butter World® high']

'best favorite other perfect good first more new only whole rich amazing salty wonderful special'

In [576]:
flavor_adj['Peanut Butter Cup high']

'favorite best good perfect first amazing many sweet last delicious heavy full absolute only generous'

In [578]:
adj_cv_df.loc[['Peanut Butter Cup high','Peanut Butter World® high']]

Unnamed: 0,2nd,5he,able,aboid,about,absent,absolute,acrid,actual,addicted,...,wild,wonderful,worried,worst,worth,wrong,yea,yucky,yummiest,yummy
Peanut Butter Cup high,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Peanut Butter World® high,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [588]:
reco_flavor_id.loc['Vanilla Caramel high']

five     Double Belgian Chocolate Chip Ice Cream high
four                                 Half Baked® high
three             PACIFIC COAST PISTACHIO GELATO high
two                                    SNICKERS® high
one               MADAGASCAN VANILLA BEAN GELATO high
Name: Vanilla Caramel high, dtype: object

In [407]:
for flavor in reco_flavor_id.index:
    print(flavor)

Salted Caramel Core high
Salted Caramel Core low
Netflix & Chilll'd™ high
Netflix & Chilll'd™ low
Chip Happens high
Chip Happens low
Cannoli high
Cannoli low
Gimme S’more!™ high
Gimme S’more!™ low
Peanut Butter Half Baked® high
Peanut Butter Half Baked® low
Berry Sweet Mascarpone high
Berry Sweet Mascarpone low
Chocolate Peanut Butter Split high
Chocolate Peanut Butter Split low
Justice ReMix'd ™ high
Justice ReMix'd ™ low
Boots on the Moooo’n™ high
Boots on the Moooo’n™ low
Americone Dream® high
Americone Dream® low
Bourbon Pecan Pie high
Bourbon Pecan Pie low
Brewed to Matter™ high
Brewed to Matter™ low
Caramel Chocolate Cheesecake high
Caramel Chocolate Cheesecake low
Cherry Garcia® high
Cherry Garcia® low
Chillin' the Roast™ high
Chillin' the Roast™ low
Chocolate Chip Cookie Dough high
Chocolate Chip Cookie Dough low
Chocolate Fudge Brownie high
Chocolate Fudge Brownie low
Chocolate Shake It™ high
Chocolate Shake It™ low
Chocolate Therapy® high
Chocolate Therapy® low
Chubby Hubby® 

In [580]:
reco_flavor_id.to_csv('reco_table.csv')

# Section 1

In [5]:
X = reviews.text
y = reviews.stars

X = X.apply(lambda x: re.sub('[\W\d]',' ',x))
X = X.apply(lambda x: re.sub('  ',' ',x))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [182]:
corpus = list(X)
corpus

['Super good don t get me wrong But I came for the caramel and brownies not the sweet cream The packaging made it seem like brownies were packed and bountiful crying frowny emoji I d say the taste of this was amazing but the ratio of brownie to sweet cream was disappointing Liked it regardless but probably won t buy again simply because it didn t live up to its promising package I ll find another one that has a better ratio and wayyy more yummy chewy brownies Overall good flavor texture idea and brownies Not so great caramel sweet cream brownie RATIO Just add more brownies Please ',
 'I decided to try it out although I m not a huge caramel fan and the first buy was ok didn t like the caramel too much and for some reason that specific pint barely had any brownies Like there were some on top but from the middle to the bottom Zilch Nada Nothing It was disappointing But for some reason I bought it again and I really do believe it was just that point bcuz the second one was glorious  I had 

In [7]:
all_reviews = reviews.text.str.cat(sep=' ')
all_reviews = re.sub('[\W\d]',' ',all_reviews)
all_reviews = re.sub('  ',' ',all_reviews)

word_list = all_reviews.split(' ')
len(word_list)

965690

In [246]:
stops = list(stopwords.words('english'))
stops.extend(['ice','cream','flavor','like','love','taste','best','really','would','favorite',
              'chocolate', 'vanilla','good','disappointed','bought','product','great','perfect',
              'flavors','pint','ever','one','buy','amazing','find','time','try','get','eat','go',
              'tried','tastes','back','used','even','tasted','please','cookie','dough','coffee',
              'butter','breyers','peanut','caramel','delicious','ben','gelato','little','always',
              'every','never','much','definitely','store','new','jerry','make','still','bit','however',
              'way','cookies','brand','could','haagen','got','better','first','wish','dairy','eating',
              'think','loved','core','excited','thought','bad','something','changed','years','chip',
              'recommend','absolutely','treat','dazs','container','last','disappointing','milk','quality',
              'also','last','brownie','sure','fudge','though','free','jerrys','part','talenti','cheesecake',
              'swirl','right','well','stores','mint','well','fun','enough','buying','recipe','bars',
              'toffee','see','icecream','nothing','heath','lot','fan','day','far','since','half','found',
              'made','bar','many','creams','pretty','change','know','different','want','nice','two',
              'purchased','thank','almost','away','less','maybe','whole',
              'thing','review','bottom','overall','going','truffels','lime','keep','amount','feel',
              'yummy','makes','sugar','top','enjoy','worth','combination','stars','pints','say',
              'unfortunately','truffles','frozen','terrible','bite','real','noticed','us','reviewed',
              'another','enjoyed','times','strawberry','actually','rum','add','big','longer','old',
              'brands','reviews','size','past','super','base','bring','kind','seems','sad','money',
              'carton','low','waste','bean','awful','looking','pieces'])
cv = CountVectorizer(stop_words=stops)
tv = TfidfVectorizer(stop_words=stops)

In [247]:
X_train_cv = cv.fit_transform(X_train)
X_test_cv  = cv.transform(X_test)

X_train_tv = tv.fit_transform(X_train)
X_test_tv  = tv.transform(X_test)

In [248]:
dtm_cv_train = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names_out())
tfidf_train = pd.DataFrame(X_train_tv.toarray(), columns=tv.get_feature_names_out())

dtm_cv_test = pd.DataFrame(X_test_cv.toarray(), columns=cv.get_feature_names_out())
tfidf_test = pd.DataFrame(X_test_tv.toarray(), columns=tv.get_feature_names_out())

In [187]:
dtm_cv

Unnamed: 0,aback,abd,able,absence,absoluely,absolute,absolutely,absolutly,abundant,acceptable,...,yrs,yuck,yuk,yum,yummiest,yumminess,yummmmmm,yummmy,yummy,yummyyyy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
911,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
912,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [188]:
tfidf

Unnamed: 0,aback,abd,able,absence,absoluely,absolute,absolutely,absolutly,abundant,acceptable,...,yrs,yuck,yuk,yum,yummiest,yumminess,yummmmmm,yummmy,yummy,yummyyyy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.318205,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
911,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
912,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
913,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Tried 4 Topics: 

CV: Creamy Vanilla, Vanilla, Ingredient Add-ins (chocolate, peanut butter, mint), Candy/Dessert Add-ins (chocolate chips, cookie dough, brownie)

TV: Vanilla, Chocolate, Caramel, Peanut Butter & Chocolate

In [238]:
nmf_cv = NMF(n_components=6, init='nndsvda')
nmf_cv.fit(X_train_cv)

nmf_tv = NMF(n_components=6, init='nndsvda')
nmf_tv.fit(X_train_tv)

NMF(init='nndsvda', n_components=6)

In [190]:
nmf_cv.components_

array([[0.00154652, 0.00322464, 0.02506273, ..., 0.        , 0.02774687,
        0.01074491],
       [0.        , 0.00085062, 0.00420683, ..., 0.        , 0.03054025,
        0.        ],
       [0.00865228, 0.        , 0.02025612, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00180094, ..., 0.        , 0.0380592 ,
        0.        ],
       [0.00639114, 0.        , 0.00501064, ..., 0.00730877, 0.14611676,
        0.        ],
       [0.        , 0.00803176, 0.        , ..., 0.        , 0.02247607,
        0.        ]])

In [191]:
nmf_tv.components_

array([[0.00087865, 0.01056185, 0.01479143, ..., 0.        , 0.03579823,
        0.01200174],
       [0.00093569, 0.        , 0.        , ..., 0.        , 0.05076049,
        0.        ],
       [0.        , 0.00512427, 0.0018833 , ..., 0.        , 0.07237849,
        0.        ],
       [0.00263131, 0.        , 0.0025405 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.01428494, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.02462252, 0.02692549,
        0.        ]])

In [14]:
topic_term_cv = nmf_cv.components_.round(3)
topic_term_tv = nmf_tv.components_.round(3)

topic_term_df_cv = pd.DataFrame(topic_term_cv,
                index = ["component_1", "component_2", "component_3", "component_4", "component_5", "component_6"],
                columns = cv.get_feature_names_out())

topic_term_df_tv = pd.DataFrame(topic_term_tv,
                index = ["component_1", "component_2", "component_3", "component_4", "component_5", "component_6"],
                columns = tv.get_feature_names_out())

In [103]:
topic_term_df_tv

Unnamed: 0,_extra_,aa,aaaaand,aaahhh,aaand,aah,aallllll,aarhus,aback,abandoned,...,zebra,zenith,zero,zest,zesty,zilch,zing,zip,zone,țhe
component_1,0.0,0.0,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002,0.0,0.001,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.002,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.001
component_3,0.0,0.001,0.0,0.001,0.0,0.0,0.002,0.001,0.0,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.002,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011,0.002,0.0,0.001,0.001,0.0,0.001,0.0
component_5,0.0,0.0,0.008,0.0,0.0,0.0,0.0,0.0,0.001,0.0,...,0.0,0.0,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_6,0.0,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.0,0.0,0.0


In [11]:
def display_topics(model, feature_names, no_top_words, topic_names = None): 
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: ", topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    print("\n")
    return model, feature_names, no_top_words

In [15]:
display_topics(nmf_cv, cv.get_feature_names_out(), 15)


Topic  1
chocolate, chip, mint, chips, dark, chunks, rich, white, fudge, marshmallow, raspberry, pieces, brownie, covered, bars

Topic  2
cookie, dough, core, chip, ben, chunks, pieces, jerrys, brownie, half, delicious, cookies, baked, oatmeal, sugar

Topic  3
vanilla, bean, breyers, natural, ingredients, haagen, creamy, recipe, better, always, real, quality, changed, dazs, breyer

Topic  4
butter, peanut, cups, pecan, cup, reese, pieces, swirls, swirl, chunks, breyers, found, hard, first, pretzels

Topic  5
creamy, delicious, caramel, gelato, sweet, dairy, texture, dazs, free, smooth, talenti, first, breyers, haagen, rich

Topic  6
ben, coffee, jerry, jerrys, toffee, always, new, chip, never, bar, store, much, heath, crunch, every




(NMF(init='nndsvda', n_components=6),
 array(['_extra_', 'aa', 'aaaaand', ..., 'zip', 'zone', 'țhe'],
       dtype=object),
 15)

In [16]:
display_topics(nmf_tv, tv.get_feature_names_out(), 15)


Topic  1
chocolate, mint, chip, chips, dark, chunks, white, marshmallow, raspberry, rich, fudge, covered, pieces, combination, brownie

Topic  2
dough, cookie, chunks, chip, core, brownie, pieces, amount, bites, brownies, mix, jerrys, enough, grainy, sugar

Topic  3
butter, peanut, pecan, cups, reese, chocolate, cup, swirls, swirl, breyers, pretzels, salty, banana, chunks, combination

Topic  4
ben, jerry, jerrys, always, baked, half, never, worth, far, wrong, little, brownie, brand, recommend, every

Topic  5
vanilla, bean, breyers, natural, creamy, always, ingredients, real, brand, breyer, quality, haagen, plain, smooth, classic

Topic  6
delicious, creamy, caramel, sweet, gelato, smooth, part, review, rich, texture, definitely, coffee, promotion, collected, recommend




(NMF(init='nndsvda', n_components=6),
 array(['_extra_', 'aa', 'aaaaand', ..., 'zip', 'zone', 'țhe'],
       dtype=object),
 15)

In [249]:
mnb_cv = MultinomialNB()
mnb_cv.fit(dtm_cv_train, y_train)
cv_score = mnb_cv.score(dtm_cv_test, y_test)

# gnb_tv = MultinomialNB()
# gnb_tv.fit(tfidf_train, y_train)
# tv_score = gnb_tv.score(tfidf_test, y_test)

In [188]:
print(cv_score)

# print(tv_score)


0.77439446366782


In [233]:
def show_top_words(vect, bayes, x=0):
    sort = bayes.feature_log_prob_[x].argsort()
    return [[vect.get_feature_names_out()[i],bayes.feature_log_prob_[x][i]] for i in sort[:-11:-1]]

In [48]:
 mnb_cv.feature_log_prob_[0]

array([-10.90074979, -10.90074979, -10.90074979, ..., -10.90074979,
       -10.90074979, -10.90074979])

In [47]:
mnb_cv.classes_

array([1, 2, 3, 4, 5])

### TODO: implement part of speech tagging to use only adjectives and nouns

In [250]:
for x in range(5):
    print('{} stars'.format(x+1))
    print(show_top_words(cv,mnb_cv,x))

1 stars
[['texture', -5.13319834537823], ['creamy', -5.525240433154254], ['hard', -5.782285536144145], ['sweet', -5.8537445001262896], ['chunks', -5.950908248579937], ['natural', -5.992580944980506], ['crunch', -5.992580944980506], ['chips', -6.117035119453911], ['bland', -6.192070305396825], ['purchase', -6.192070305396825]]
2 stars
[['texture', -5.468895318867729], ['creamy', -5.519667644241153], ['chunks', -5.68956668103655], ['sweet', -5.83552059365963], ['hard', -6.132772117127562], ['chips', -6.152190202984663], ['rich', -6.346346217425621], ['price', -6.446429675982603], ['bland', -6.473097923064765], ['anymore', -6.500496897252879]]
3 stars
[['texture', -5.328436131473369], ['chunks', -5.377707180480152], ['creamy', -5.387859551944169], ['sweet', -5.724331788565382], ['chips', -5.84569264556965], ['hard', -6.1441856341256464], ['swirls', -6.259254963910434], ['price', -6.5070911278150145], ['smooth', -6.538839826129595], ['needs', -6.571629648952586]]
4 stars
[['creamy', -4.683

# Section 2
## Redo Analysis with only 5 star ratings

In [140]:
X = reviews.text[reviews.stars==5]
y = reviews.stars[reviews.stars==5]

X = X.apply(lambda x: re.sub('[\W\d]',' ',x))
X = X.apply(lambda x: re.sub('  ',' ',x))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [141]:
cv = CountVectorizer(stop_words='english', max_df=0.7)
tv = TfidfVectorizer(stop_words='english', max_df=0.7)

In [142]:
X_train_cv = cv.fit_transform(X_train)
X_test_cv  = cv.transform(X_test)

X_train_tv = tv.fit_transform(X_train)
X_test_tv  = tv.transform(X_test)

In [None]:
X_train_cv = cv.fit_transform(X_train)
X_test_cv  = cv.transform(X_test)

X_train_tv = tv.fit_transform(X_train)
X_test_tv  = tv.transform(X_test)

In [143]:
dtm_cv = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names_out())
tfidf = pd.DataFrame(X_train_tv.toarray(), columns=tv.get_feature_names_out())

Tried 4 Topics:

In [144]:
nmf_cv = NMF(n_components=4, init='nndsvda')
nmf_cv.fit(dtm_cv)

nmf_tv = NMF(n_components=4, init='nndsvda')
nmf_tv.fit(tfidf)

NMF(init='nndsvda', n_components=4)

In [146]:
topic_term_cv = nmf_cv.components_.round(3)
topic_term_tv = nmf_tv.components_.round(3)

topic_term_df_cv = pd.DataFrame(topic_term_cv,
                index = ["component_1", "component_2", "component_3", "component_4"],
                columns = cv.get_feature_names_out())

topic_term_df_tv = pd.DataFrame(topic_term_tv,
                index = ["component_1", "component_2", "component_3", "component_4"],
                columns = tv.get_feature_names_out())

In [150]:
display_topics(nmf_cv, cv.get_feature_names_out(), 10)


Topic  1
ice, cream, best, vanilla, like, favorite, good, breyers, creamy, just

Topic  2
flavor, favorite, like, just, best, time, ve, flavors, ben, new

Topic  3
chocolate, cookie, dough, peanut, butter, like, chip, perfect, just, vanilla

Topic  4
love, taste, flavors, good, creamy, just, product, absolutely, gelato, great




(NMF(init='nndsvda', n_components=4),
 array(['_____', '_extra_', 'aa', ..., 'zip', 'zone', 'țhe'], dtype=object),
 10)

In [149]:
display_topics(nmf_tv, tv.get_feature_names_out(), 10)


Topic  1
cream, ice, best, vanilla, ve, eat, breyers, brand, better, creams

Topic  2
dough, cookie, ben, jerrys, chocolate, brownie, chip, chunks, jerry, good

Topic  3
love, chocolate, taste, creamy, good, great, like, delicious, perfect, just

Topic  4
flavor, favorite, time, new, ben, jerry, flavors, tried, ve, far




(NMF(init='nndsvda', n_components=4),
 array(['_____', '_extra_', 'aa', ..., 'zip', 'zone', 'țhe'], dtype=object),
 10)

# Section 3
## Vanilla Only Flavors
- 47_bj
- 59_hd
- 61_hd
- 0_breyers
- 1_breyers
- 2_breyers
- 10_breyers

In [156]:
X = reviews.text[reviews.key.isin(['47_bj','59_hd','61_hd','0_breyers','1_breyers','2_breyers','10_breyers'])]
y = reviews.stars[reviews.key.isin(['47_bj','59_hd','61_hd','0_breyers','1_breyers','2_breyers','10_breyers'])]

X = X.apply(lambda x: re.sub('[\W\d]',' ',x))
X = X.apply(lambda x: re.sub('  ',' ',x))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [157]:
cv = CountVectorizer(stop_words='english', max_df=0.7)
tv = TfidfVectorizer(stop_words='english', max_df=0.7)

In [158]:
X_train_cv = cv.fit_transform(X_train)
X_test_cv  = cv.transform(X_test)

X_train_tv = tv.fit_transform(X_train)
X_test_tv  = tv.transform(X_test)

In [159]:
dtm_cv = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names_out())
tfidf = pd.DataFrame(X_train_tv.toarray(), columns=tv.get_feature_names_out())

In [160]:
nmf_cv = NMF(n_components=4, init='nndsvda')
nmf_cv.fit(dtm_cv)

nmf_tv = NMF(n_components=4, init='nndsvda')
nmf_tv.fit(tfidf)



NMF(init='nndsvda', n_components=4)

In [161]:
topic_term_cv = nmf_cv.components_.round(3)
topic_term_tv = nmf_tv.components_.round(3)

topic_term_df_cv = pd.DataFrame(topic_term_cv,
                index = ["component_1", "component_2", "component_3", "component_4"],
                columns = cv.get_feature_names_out())

topic_term_df_tv = pd.DataFrame(topic_term_tv,
                index = ["component_1", "component_2", "component_3", "component_4"],
                columns = tv.get_feature_names_out())

In [162]:
display_topics(nmf_cv, cv.get_feature_names_out(), 10)


Topic  1
ice, cream, love, best, family, creamy, delicious, favorite, brand, eat

Topic  2
vanilla, flavor, bean, creamy, haagen, dazs, favorite, best, rich, tried

Topic  3
taste, like, love, creamy, great, good, just, haagen, dazs, tastes

Topic  4
breyers, natural, vanilla, ingredients, like, gum, sugar, product, used, real




(NMF(init='nndsvda', n_components=4),
 array(['aback', 'abd', 'able', ..., 'yummmy', 'yummy', 'yummyyyy'],
       dtype=object),
 10)

In [163]:
display_topics(nmf_tv, tv.get_feature_names_out(), 10)


Topic  1
cream, ice, breyers, natural, best, vanilla, favorite, hands, ingredients, loved

Topic  2
love, great, taste, recommend, creamy, delicious, flavor, amazing, good, smooth

Topic  3
vanilla, flavor, bean, creamy, haagen, favorite, dazs, perfect, delicious, rich

Topic  4
like, taste, good, tastes, really, just, homemade, ice, cream, buy




(NMF(init='nndsvda', n_components=4),
 array(['aback', 'abd', 'able', ..., 'yummmy', 'yummy', 'yummyyyy'],
       dtype=object),
 10)