In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from wordcloud import WordCloud

import nltk
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
!pip install contractions
import contractions
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
stop_words=nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Exploration du jeu de données

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
DATASET_FILE = "/content/drive/MyDrive/S9/IAAppliquee/NLP/dataset.csv"
dataset_df = pd.read_csv(DATASET_FILE)
dataset_df

Unnamed: 0,text,stars
0,I've only had food from here once and it wasn'...,1
1,I will never return here again. Ever. I was ...,1
2,I wish my experience was great as others. I di...,1
3,Are the rosemary grapefruit scones supposed to...,1
4,Our takeout order was half wrong. Food was mis...,1
...,...,...
24995,I was a loyal fan of Aroy before the ownership...,5
24996,Stopped here for a bite while wandering around...,5
24997,"A quiet place with excellent food, great music...",5
24998,Super delicious food. Awesome vibe. I suffered...,5


## 1.1 Répartition des avis clients en fonction du nombre d'étoiles

In [5]:
stars_columns = dataset_df.stars
stars_columns

0        1
1        1
2        1
3        1
4        1
        ..
24995    5
24996    5
24997    5
24998    5
24999    5
Name: stars, Length: 25000, dtype: int64

In [None]:
stars_columns.value_counts()

In [None]:
stars_columns.value_counts().plot(kind="pie", figsize=(10, 8), autopct="%1.1f%%", shadow=True)

## 1.2 Distribution de la longueur (nombre de mots) des avis clients

In [None]:
dataset_df["length"] = dataset_df["text"].apply(lambda x: len(x.split()))
dataset_df

In [None]:
dataset_df.length.plot(kind="hist", bins=100, figsize=(10, 8))

## 1.3 Distribution de la longueur des avis clients en fonction du nombre d'étoiles des avis

In [None]:
plt.figure(figsize=(10, 8))

ax = sns.boxplot(x=dataset_df.stars ,
            y=dataset_df.length,
            showmeans=True,
            )
ax.set_ylim(0, 400)

ax.set_title("Répartition des longueurs des avis en fonction du nombre d'étoiles")

# 2. Pré-traitement du jeu de données

Objectifs:
- Tokenization
- Normalisation du texte 
- Lemmatisation
- Effacement des stop words (pas les négatifs)

In [11]:
#Nouveau dataframe post traitement 
df_post_traitement = pd.DataFrame()

Tokenisation, stop words et normalisation

In [12]:
#enlever no, not, nor, only, don't, aren't, didn, didn't, doesn, doesn't, hasn, hasn"hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
stop_words_a_enlever = ['no', 'not', 'nor', "don't", "aren't", 'didn', "didn't", 'doesn', "doesn't",'aren', 'couldn', "couldn't", 'hadn', "hadn't", 'hasn',"hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
stop_words_sans_neg = [x for x in stop_words if x not in stop_words_a_enlever]

print(stop_words_sans_neg)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', '

In [16]:
tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')

def tokenization(text): 
  return tokenizer.tokenize(contractions.fix(text.lower()))

df_post_traitement['text_token']= dataset_df.text.apply(tokenization)

def stop_words(tokens):
  return [x for x in tokens if x not in stop_words_sans_neg]

df_post_traitement['text_token_stop_words']= df_post_traitement.text_token.apply(stop_words)
df_post_traitement.head()

Unnamed: 0,text_token,text_token_stop_words
0,"[I, have, only, had, food, from, here, once, a...","[I, food, not, memorable, ., panang, curry, ba..."
1,"[i, will, never, return, here, again, ., ever,...","[never, return, ., ever, ., sitting, booth, wa..."
2,"[i, wish, my, experience, was, great, as, othe...","[wish, experience, great, others, ., dined, we..."
3,"[are, the, rosemary, grapefruit, scones, suppo...","[rosemary, grapefruit, scones, supposed, taste..."
4,"[our, takeout, order, was, half, wrong, ., foo...","[takeout, order, half, wrong, ., food, missing..."


Pos-Tagging

In [17]:
def pos_tagging(tokens):
  return nltk.pos_tag(tokens)

df_post_traitement['text_pos_tag'] = df_post_traitement.text_token.apply(pos_tagging)

df_post_traitement.head()

Unnamed: 0,text_token,text_token_stop_words,text_pos_tag
0,"[I, have, only, had, food, from, here, once, a...","[I, food, not, memorable, ., panang, curry, ba...","[(I, PRP), (food, NN), (not, RB), (memorable, ..."
1,"[i, will, never, return, here, again, ., ever,...","[never, return, ., ever, ., sitting, booth, wa...","[(never, RB), (return, NN), (., .), (ever, RB)..."
2,"[i, wish, my, experience, was, great, as, othe...","[wish, experience, great, others, ., dined, we...","[(wish, JJ), (experience, NN), (great, JJ), (o..."
3,"[are, the, rosemary, grapefruit, scones, suppo...","[rosemary, grapefruit, scones, supposed, taste...","[(rosemary, JJ), (grapefruit, NN), (scones, NN..."
4,"[our, takeout, order, was, half, wrong, ., foo...","[takeout, order, half, wrong, ., food, missing...","[(takeout, IN), (order, NN), (half, NN), (wron..."


Lemmatisation

In [18]:
lem = WordNetLemmatizer()

def lemmatisation(tokens_tag):
  liste = list()
  for word, tag in tokens_tag:
    if tag.startswith('J'):
      liste.append(lem.lemmatize(word, 'a'))
    elif tag.startswith('V'):
      liste.append(lem.lemmatize(word, 'v'))
    elif tag.startswith('N'):
      liste.append(lem.lemmatize(word, 'n'))
    elif tag.startswith('R'):
      liste.append(lem.lemmatize(word, 'r'))
    else : 
      liste.append(lem.lemmatize(word))
  return " ".join(liste)

In [20]:
liste = df_post_traitement.text_pos_tag.apply(lemmatisation)
df_post_traitement['text_lem'] = pd.DataFrame(liste)
df_post_traitement[['text_token_stop_words','text_lem']].head()

Unnamed: 0,text_token_stop_words,text_lem
0,"[I, food, not, memorable, ., panang, curry, ba...",I food not memorable . panang curry balance fl...
1,"[never, return, ., ever, ., sitting, booth, wa...",never return . ever . sit booth wait dinner co...
2,"[wish, experience, great, others, ., dined, we...",wish experience great others . din wednesday n...
3,"[rosemary, grapefruit, scones, supposed, taste...",rosemary grapefruit scone suppose taste like w...
4,"[takeout, order, half, wrong, ., food, missing...","takeout order half wrong . food miss , portion..."


Prendre les 50 environ mots les plus fréquents pour les ajouter aux stop_words 