In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#1. Importation Dataset pré-traité

In [3]:
DATASET_FILE = "/content/drive/MyDrive/NLP I3/dataset_cleaned.csv"

In [4]:
df = pd.read_csv(DATASET_FILE)

In [5]:
df

Unnamed: 0,text,stars,length,text_cleaned
0,I've only had food from here once and it wasn'...,1,68,food memorable panang curry balance flavor lik...
1,I will never return here again. Ever. I was ...,1,87,NOT_return ever sit booth wait dinner come scu...
2,I wish my experience was great as others. I di...,1,166,wish experience great others din wednesday nig...
3,Are the rosemary grapefruit scones supposed to...,1,81,rosemary grapefruit scone suppose taste like w...
4,Our takeout order was half wrong. Food was mis...,1,32,takeout order half wrong food miss portion siz...
...,...,...,...,...
24995,I was a loyal fan of Aroy before the ownership...,5,75,loyal fan aroy ownership change apprehensive v...
24996,Stopped here for a bite while wandering around...,5,55,stopped bite wander around faneuil hall pleasa...
24997,"A quiet place with excellent food, great music...",5,32,quiet place excellent food great music helpful...
24998,Super delicious food. Awesome vibe. I suffered...,5,41,super delicious food awesome vibe suffer disne...


In [6]:
df_neg=df[df["stars"].isin([1,2])]

In [7]:
df_neg

Unnamed: 0,text,stars,length,text_cleaned
0,I've only had food from here once and it wasn'...,1,68,food memorable panang curry balance flavor lik...
1,I will never return here again. Ever. I was ...,1,87,NOT_return ever sit booth wait dinner come scu...
2,I wish my experience was great as others. I di...,1,166,wish experience great others din wednesday nig...
3,Are the rosemary grapefruit scones supposed to...,1,81,rosemary grapefruit scone suppose taste like w...
4,Our takeout order was half wrong. Food was mis...,1,32,takeout order half wrong food miss portion siz...
...,...,...,...,...
9995,Never order Hot Pot here. Too much vegetables ...,2,17,NOT_order hot pot much vegetable NOT_enough me...
9996,I work at BMC and heard about this new place o...,2,253,work bmc hear new place open co worker decide ...
9997,"Went here for dinner, drinks were very good, f...",2,150,went dinner drink good food could give well or...
9998,The food was mediocre. Not horrible not great....,2,64,food mediocre NOT_horrible NOT_great sausage b...


#2. Vectorisation des reviews négatives

In [8]:
vectorizer = TfidfVectorizer(max_df = .8, min_df = .02)
X = vectorizer.fit_transform(df_neg['text_cleaned'])
Y = vectorizer.get_feature_names()



In [9]:
print(X)

  (0, 402)	0.240155046184614
  (0, 342)	0.13836940429801894
  (0, 498)	0.203547374711164
  (0, 88)	0.26396228572628294
  (0, 23)	0.2703976343428603
  (0, 128)	0.19699174014065418
  (0, 330)	0.30890447591564985
  (0, 66)	0.30454158256143943
  (0, 448)	0.30797369406296393
  (0, 495)	0.2224788498323866
  (0, 315)	0.2818427151755024
  (0, 235)	0.30009308938865187
  (0, 453)	0.20789666837411044
  (0, 246)	0.15467116906103293
  (0, 169)	0.2745980900927676
  (0, 171)	0.23579555343731184
  (1, 119)	0.28961264859569
  (1, 101)	0.2684366129308462
  (1, 354)	0.26187962797518227
  (1, 316)	0.25501666486354624
  (1, 394)	0.14175069421424802
  (1, 313)	0.19813311025993985
  (1, 227)	0.26706624294573295
  (1, 180)	0.23926817051012236
  (1, 242)	0.17741049202115672
  :	:
  (9998, 287)	0.1822258745942362
  (9998, 351)	0.19841116039073448
  (9998, 194)	0.2825123808181976
  (9998, 387)	0.22118333304090643
  (9998, 53)	0.2298805958119867
  (9998, 471)	0.23710920040869407
  (9998, 187)	0.11675968403083943


In [10]:
print(Y)

['10', '12', '15', '20', '25', '30', '40', '45', '50', 'able', 'absolutely', 'across', 'actually', 'add', 'ago', 'almost', 'already', 'also', 'although', 'always', 'amount', 'another', 'anyone', 'anything', 'apologize', 'appetizer', 'area', 'around', 'arrive', 'ask', 'atmosphere', 'attention', 'attitude', 'average', 'avoid', 'away', 'awful', 'back', 'bad', 'bar', 'bartender', 'base', 'basically', 'bean', 'beef', 'beer', 'behind', 'believe', 'best', 'big', 'bill', 'bit', 'bite', 'bland', 'boston', 'bother', 'bowl', 'bread', 'breakfast', 'bring', 'burger', 'business', 'busy', 'buy', 'call', 'card', 'care', 'chance', 'change', 'charge', 'cheap', 'check', 'cheese', 'chef', 'chicken', 'chinese', 'chip', 'choice', 'choose', 'clean', 'clearly', 'close', 'coffee', 'cold', 'come', 'complain', 'completely', 'consider', 'cook', 'cool', 'cost', 'could', 'counter', 'couple', 'course', 'cream', 'crowd', 'cup', 'customer', 'cut', 'day', 'deal', 'decent', 'decide', 'decor', 'definitely', 'delicious', 

#3. Affichage Matrice TF-IDF

In [11]:
tf_idf = pd.DataFrame(X.toarray(), columns=Y)
tf_idf

Unnamed: 0,10,12,15,20,25,30,40,45,50,able,...,work,worst,worth,would,write,wrong,year,yelp,yes,yet
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.119721,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.414369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.143432,0.0,0.474688,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
9996,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.086631,0.0,0.0,0.118069,0.0,0.000000,0.0,0.0,0.0,0.0
9997,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.073215,0.0,0.000000,0.0,0.0,0.0,0.0
9998,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


#4. MNF

In [12]:
nmf_model = NMF(15)#à exporter en pickle
doc_topic = nmf_model.fit_transform(tf_idf)#à exporter en pickle



**4.A. Affichage Matrice Reviews x Topics**

In [13]:
rev_top=pd.DataFrame(doc_topic)
print(rev_top)

            0         1         2         3         4         5         6   \
0     0.000000  0.063201  0.000000  0.000000  0.062578  0.000000  0.003024   
1     0.032413  0.000000  0.000000  0.000000  0.000000  0.029828  0.000000   
2     0.021986  0.020304  0.000000  0.002862  0.029840  0.000000  0.000000   
3     0.008395  0.074779  0.000000  0.000000  0.000000  0.000000  0.000000   
4     0.000000  0.000000  0.000000  0.051118  0.028484  0.016141  0.000000   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.000000  0.039509  0.000934  0.000000  0.000000  0.000000  0.002945   
9996  0.020313  0.061765  0.062999  0.008651  0.000000  0.000000  0.001040   
9997  0.037922  0.044536  0.000000  0.035331  0.040529  0.000000  0.000000   
9998  0.000000  0.033912  0.002837  0.000000  0.036302  0.000000  0.000000   
9999  0.000000  0.008590  0.000000  0.001726  0.023864  0.000000  0.000384   

            7         8         9         10        11        1

4.B. Affichage Matrice Topics x Words

In [14]:
top_wor=pd.DataFrame(nmf_model.components_)
print(top_wor)

         0         1         2         3         4         5         6    \
0   0.124263  0.049748  0.036845  0.082604  0.043708  0.088055  0.007174   
1   0.000000  0.028213  0.000000  0.000000  0.008002  0.000000  0.000000   
2   0.000000  0.000000  0.000000  0.002663  0.010943  0.034132  0.020094   
3   0.069897  0.003615  0.049016  0.077789  0.020969  0.029039  0.017783   
4   0.000000  0.014321  0.000000  0.000000  0.000000  0.000000  0.014183   
5   0.478501  0.037998  0.535654  0.508579  0.131408  0.580761  0.214910   
6   0.000000  0.006397  0.000000  0.000000  0.007683  0.000000  0.006522   
7   0.038575  0.014851  0.000000  0.009326  0.004223  0.000000  0.000000   
8   0.043898  0.038614  0.000000  0.000000  0.000000  0.000000  0.000000   
9   0.083642  0.031909  0.017497  0.015585  0.008919  0.000000  0.043360   
10  0.100755  0.006729  0.113124  0.059803  0.032934  0.032651  0.001105   
11  0.010303  0.000000  0.000000  0.000000  0.005657  0.000000  0.000000   
12  0.000000

#5. Affichage des 15 topics pour les mots

In [15]:
def display_topics_words(model, feature_names, num_top_words, topic_names=None):
    display_topics=[]
    topic_list=[]
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i] \
             for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [16]:
display_topics_words(nmf_model, Y, 10)


Topic  0
say, tell, go, would, give, call, ask, get, make, manager

Topic  1
good, like, taste, really, sauce, dish, flavor, pretty, much, well

Topic  2
pizza, cheese, slice, delivery, good, call, cold, sauce, eat, deliver

Topic  3
order, take, delivery, wrong, call, get, deliver, item, come, half

Topic  4
food, restaurant, price, quality, good, mediocre, cold, go, chinese, great

Topic  5
wait, minute, hour, 30, 15, 20, get, 10, seat, long

Topic  6
burger, fry, onion, cheese, get, cook, well, good, cold, eat

Topic  7
place, really, go, like, get, look, try, love, people, want

Topic  8
chicken, rice, fry, wing, sauce, dry, piece, fried, eat, salad

Topic  9
bar, drink, beer, bartender, night, go, friend, sit, get, area

Topic  10
table, come, server, waitress, ask, waiter, take, sit, seat, restaurant

Topic  11
time, last, go, first, location, every, second, visit, year, always

Topic  12
service, bad, slow, customer, ever, terrible, horrible, rude, poor, experience

Topic  13
s

#6. Affichage des 15 topics pour les reviews

In [17]:
def display_topics_reviews(model, num_top_words, topic_names=None): 
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")

        print("\n".join([df_neg["text"][i] \
             for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [18]:
display_topics_reviews(nmf_model, 3)


Topic  0
Really enjoyed the original, the fire seems like it took the spark out of the quality of food here. If you are expecting the cozy restaurant feel you are in for a rude awakening. It feels like Yuppies came in and redid the whole place including the taste of the food. I loved this place gave, had no problem giving them my money, but when you walk into this new cold feeling yuppie place you will understand, unless you never went before. The prices are sky high probably cause they know yuppies from California have no problem paying them. The non-MSG is not what is making this place suck it is the rudeness of the employees and owners who know feel like they operate a Panda Express. Sorry but I used to love this place, no longer with the quality of employees and food.
I ordered the vegetarian ramen and asked for tofu to be added. What I received was a lukewarm vegetable broth that tasted exactly like canned vegetable broth from the grocery store (gross) and COLD, UNCOOKED tofu in 

#7. Définition Labels

In [19]:
lablels = {
    #
    0 : "" ,
    #
    1 : "" ,
    #
    2 : "" ,
    #
    3 : "" ,
    #
    4 : "" ,
    #
    5 : "Trop d'attente" ,
    #
    6 : "" ,
    #
    7 : "" ,
    #
    8 : "" ,
    #
    9 : "" ,
    #
    10 : "" ,
    #
    11 : "" ,
    #
    12 : "" ,
    #
    13 : "Pas de prise en considération des commandes" ,
    #
    14 : "Pas de réponse téléphone" 

}

#8 Extractions

In [20]:
import pickle
with open('model_file', 'wb') as file:
    pickle.dump(nmf_model, file)

In [21]:
with open('vectoriseur_file', 'wb') as file:
  pickle.dump(vectorizer,file)