In [18]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

import pickle

In [19]:
# Location of the cleaned dataset CSV.
DATASET_FILE = "/content/dataset_cleaned.csv"
# Read the CSV and keep only the rows whose `stars` value is below 3.
df = pd.read_csv(DATASET_FILE).loc[lambda frame: frame["stars"] < 3]

In [20]:
# Extend scikit-learn's built-in English stop words with extra filler terms.
# Recent scikit-learn releases validate `stop_words` as 'english', a list, or
# None, so the frozenset produced by `union` is converted to a (sorted) list.
stop_noun = ['today', 'thing']
stop_words_noun_agg = sorted(text.ENGLISH_STOP_WORDS.union(stop_noun))

# Unigram TF-IDF vectorizer; terms appearing in more than 80% or in fewer than
# 2% of the documents are dropped from the vocabulary.
vectorizer = TfidfVectorizer(
    stop_words=stop_words_noun_agg,
    ngram_range=(1, 1),
    max_df=.8,
    min_df=.02,
)
# Fit on the cleaned text column and build the sparse document-term matrix.
data = vectorizer.fit_transform(df.text_cleaned)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer the replacement
# and only fall back to the old accessor on releases that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Dense document-term frame: one row per document (sharing df's index so rows
# can be traced back to `df`), one column per vocabulary term.
matrix_df = pd.DataFrame(data.toarray(), columns=feature_names, index=df.index)
# Visually inspect the document-term matrix
matrix_df

Unnamed: 0,10,12,15,20,25,30,40,45,50,able,...,wish,woman,work,worst,worth,write,wrong,year,yelp,yes
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.16964,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.450985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.000000,0.0,0.0,0.0,0.516634,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
9996,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.093894,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
9997,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
9998,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [21]:
def display_topics(model, feature_names, num_top_words, topic_names=None):
    """Print every topic of a fitted topic model with its highest-weighted terms.

    Parameters
    ----------
    model : object exposing ``components_``
        Fitted model; each row of ``components_`` is treated as one topic and
        each column as the weight of one term.
    feature_names : sequence of str
        Term for each column of ``components_``, indexed by column position.
    num_top_words : int
        Number of highest-weighted terms to print per topic.
    topic_names : sequence of str, optional
        Label for each topic, matched by position. A topic whose label is
        missing (sequence too short), empty, or ``None`` is printed with its
        number instead.

    Returns
    -------
    None
        The function only writes to stdout.
    """
    for ix, topic in enumerate(model.components_):
        # Look the label up defensively: a `topic_names` shorter than the
        # number of components must not abort the listing with an IndexError.
        try:
            label = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            label = None

        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")

        # argsort is ascending, so the reversed tail slice yields the indices
        # of the `num_top_words` largest weights, highest first.
        top_term_ids = topic.argsort()[:-num_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_term_ids]))

In [22]:
# Human-readable topic labels, matched to the NMF components purely by position.
# NOTE(review): because the match is positional, the labels must be re-checked
# against the printed top words whenever the model is re-fit.
topics = ['Price & Portion', 'Waiting Time', 'Pizza', 'Customer Service', 'Food Quality', 'Burger', 'Ordering & Delivery to table', 'Place Environnement',
          'Chicken', 'Drinks', 'Sandwich', 'Mexican Taco', 'Location', 'Sushi and Rice', 'Taste & Experience']

# Fixed seed so that Restart & Run All yields the same components in the same
# order; without it the positional labels above are not tied to a repeatable fit.
RANDOM_STATE = 42

# One component per label, so the model size can never drift from the label list.
nmf_model = NMF(n_components=len(topics), random_state=RANDOM_STATE)
# doc_topic: one row per document, one column per topic weight.
doc_topic = nmf_model.fit_transform(matrix_df)

# matrix_df's columns are the vectorizer's vocabulary in column order; using
# them avoids `get_feature_names()`, which was removed in scikit-learn 1.2.
display_topics(nmf_model, matrix_df.columns.to_list(), 10, topics)




Topic: ' Price & Portion '
like, taste, sauce, dish, look, really, flavor, bad, bland, make

Topic: ' Waiting Time '
wait, minute, hour, seat, 30, table, 15, 20, 10, min

Topic: ' Pizza '
pizza, cheese, slice, delivery, cold, eat, say, sauce, deliver, soggy

Topic: ' Customer Service '
service, bad, customer, slow, horrible, terrible, rude, poor, server, location

Topic: ' Food Quality '
food, restaurant, quality, cold, mediocre, eat, price, great, chinese, overprice

Topic: ' Burger '
burger, fry, onion, cheese, cook, cold, eat, location, greasy, beer

Topic: ' Ordering & Delivery to table '
order, delivery, wrong, tell, drink, item, deliver, ask, half, receive

Topic: ' Place Environnement '
place, really, try, people, love, want, look, staff, close, star

Topic: ' Chicken '
chicken, rice, fry, wing, sauce, dry, piece, fried, beef, bean

Topic: ' Drinks '
bar, drink, beer, bartender, night, friend, sit, great, area, music

Topic: ' Sandwich '
good, price, pretty, menu, really, coffe



In [23]:
# Document-topic weights as a frame: one column per topic label, rows numbered
# by position in `doc_topic` (default integer index).
df_topic = pd.DataFrame(doc_topic, columns=topics)

# For every topic, record the row number of the document with the largest
# weight for that topic (first occurrence on ties).
reviews = []
for topic in topics:
    strongest = df_topic[topic].nlargest(1)
    reviews.append(strongest.index.values[0])
print(reviews)

[5838, 3413, 5484, 6618, 8377, 6270, 4146, 123, 9130, 6987, 5511, 7375, 3719, 9041, 3596]


In [24]:
# Persist the fitted NMF model and the fitted vectorizer, each to its own
# pickle file in the current working directory.
for out_name, artefact in (('model_amine', nmf_model), ('vectorizer_amine', vectorizer)):
    with open(out_name, 'wb') as out_file:
        pickle.dump(artefact, out_file)