In [9]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package stopwords to /Users/donor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/donor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/donor/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.DataFrame(pd.read_csv('reviews.csv'))
df = df.reset_index()
df.head()

Unnamed: 0,index,Rating,Year_Month,Reviewer_Location,Review_Text
0,0,5,2019-3,United Arab Emirates,"We've been to Disneyland Hongkong and Tokyo, s..."
1,1,4,2018-6,United Kingdom,I went to Disneyland Paris in April 2018 on Ea...
2,2,5,2019-4,United Kingdom,"What a fantastic place, the queues were decent..."
3,3,4,2019-4,Australia,We didn't realise it was school holidays when ...
4,4,5,missing,France,A Trip to Disney makes you all warm and fuzzy ...


In [5]:
df['year'] = df['Year_Month'].apply(lambda x : x.split('-')[0])

In [6]:
df

Unnamed: 0,index,Rating,Year_Month,Reviewer_Location,Review_Text,year
0,0,5,2019-3,United Arab Emirates,"We've been to Disneyland Hongkong and Tokyo, s...",2019
1,1,4,2018-6,United Kingdom,I went to Disneyland Paris in April 2018 on Ea...,2018
2,2,5,2019-4,United Kingdom,"What a fantastic place, the queues were decent...",2019
3,3,4,2019-4,Australia,We didn't realise it was school holidays when ...,2019
4,4,5,missing,France,A Trip to Disney makes you all warm and fuzzy ...,missing
...,...,...,...,...,...,...
13625,13625,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...,missing
13626,13626,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...,missing
13627,13627,5,missing,South Africa,My eleven year old daughter and myself went to...,missing
13628,13628,4,missing,United States,"This hotel, part of the Disneyland Paris compl...",missing


In [10]:
REMPLACE_SANS_ESPACE = re.compile("[;:!\'?,\"()\[\]]")
REMPLACE_AVEC_ESPACE = re.compile("()|(\-)|(\/)|[.]")
PUR_NOMBRE = re.compile("[0-9]")

def preprocess(txt):
    txt = PUR_NOMBRE.sub("", txt) # retire les nombres (comme les années)
    txt.replace('\n', ' ')# Retire les \n (retours chariots)
    txt = REMPLACE_SANS_ESPACE.sub(" ", txt)
    txt = REMPLACE_AVEC_ESPACE.sub("", txt)
    txt = txt.lower()
    return txt

df['data'] = df['Review_Text'].apply(lambda x: preprocess(x))

In [None]:
stemmer = SnowballStemmer('english')
df['data'] = df['data'].apply(lambda w: stemmer.stem(w))

In [11]:
eng_stopwords = set(stopwords.words('english'))
filtre_stopen =  lambda text: [token for token in text if token.lower() not in eng_stopwords]
df['data'] = [' '.join(filtre_stopen(word_tokenize(item))) for item in df['data']]
df.head()

Unnamed: 0,index,Rating,Year_Month,Reviewer_Location,Review_Text,year,data
0,0,5,2019-3,United Arab Emirates,"We've been to Disneyland Hongkong and Tokyo, s...",2019,disneyland hongkong tokyo far one best looking...
1,1,4,2018-6,United Kingdom,I went to Disneyland Paris in April 2018 on Ea...,2018,went disneyland paris april easter weekend kno...
2,2,5,2019-4,United Kingdom,"What a fantastic place, the queues were decent...",2019,fantastic place queues decent best time year g...
3,3,4,2019-4,Australia,We didn't realise it was school holidays when ...,2019,realise school holidays went consequently extr...
4,4,5,missing,France,A Trip to Disney makes you all warm and fuzzy ...,missing,trip disney makes warm fuzzy actual kid big us...


In [16]:
# Polarisation des ratings :

def setClassBin(i):
    if i > 3:
        return 1
    else:
        return 0

df['polar'] = [setClassBin(x) for x in df.Rating]


bilan = df.groupby(['Rating','polar'])
bilan.size()

Rating  polar
1       0         828
2       0        1044
3       0        2083
4       1        3564
5       1        6111
dtype: int64

In [17]:

df.head()

Unnamed: 0,index,Rating,Year_Month,Reviewer_Location,Review_Text,year,data,polar
0,0,5,2019-3,United Arab Emirates,"We've been to Disneyland Hongkong and Tokyo, s...",2019,disneyland hongkong tokyo far one best looking...,1
1,1,4,2018-6,United Kingdom,I went to Disneyland Paris in April 2018 on Ea...,2018,went disneyland paris april easter weekend kno...,1
2,2,5,2019-4,United Kingdom,"What a fantastic place, the queues were decent...",2019,fantastic place queues decent best time year g...,1
3,3,4,2019-4,Australia,We didn't realise it was school holidays when ...,2019,realise school holidays went consequently extr...,1
4,4,5,missing,France,A Trip to Disney makes you all warm and fuzzy ...,missing,trip disney makes warm fuzzy actual kid big us...,1


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


cv = CountVectorizer(binary=True)
cv.fit(df["data"])

CountVectorizer(binary=True)

In [14]:
X = df['data']

y = df['polar']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,stratify=y)

In [15]:
Xf_onehot = cv.transform(df['data'])
pd.DataFrame(Xf_onehot.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38154,38155,38156,38157,38158,38159,38160,38161,38162,38163
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13625,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13626,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13627,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13628,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
