In [3]:
### Données
import pandas as pd

# Read the raw reviews, then preview the first rows and the table dimensions.
df = pd.read_csv("MovieReview.csv")
display(df.head())
print(df.shape)

# Keep every column except 'sentiment' for the steps that follow.
df = df.drop(columns='sentiment')

### nettoyer les données et supprimer les stopwords

import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch only the NLTK resources this notebook uses. Calling nltk.download()
# with no arguments opens the interactive downloader, which stalls a
# non-interactive "Restart & Run All".
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' / 'punkt_tab' back word_tokenize (which of the two is required
#     depends on the installed NLTK version, so both are requested)
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)
stop_words = stopwords.words('english')

def unicode_to_ascii(s):
    """Strip combining marks (accents/diacritics) from a string.

    The text is decomposed with Unicode NFD normalisation so that accented
    letters become a base character followed by combining marks; every
    character whose Unicode category is 'Mn' is then dropped.

    Characters that have no such decomposition are returned unchanged, so
    the result is not guaranteed to be pure ASCII.

    Args:
        s: Input string.

    Returns:
        The NFD-normalised string without 'Mn' characters.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
    """Normalise one review and drop English stopwords.

    Steps, in order:
      1. Lower-case, strip surrounding whitespace and remove combining
         marks via ``unicode_to_ascii``.
      2. Apply the regex substitutions listed below.
      3. Tokenise with NLTK ``word_tokenize`` and discard every token found
         in the module-level ``stop_words`` collection.

    Args:
        w: Raw text of a review (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = unicode_to_ascii(w.lower().strip())

    substitutions = (
        # Put a space on both sides of ? . ! , ¿
        # e.g. "he is a boy." => "he is a boy . "
        (r"([?.!,¿])", r" \1 "),
        # Collapse runs of spaces and/or double quotes (the character class
        # contains both) into a single space.
        (r'[" "]+', " "),
        # Replace every run of characters other than letters, "?", "." and
        # "!" with a space; commas and digits are therefore removed here.
        (r"[^a-zA-Z?.!]+", " "),
        # Delete words made of at most two word characters.
        (r'\b\w{0,2}\b', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Stopword removal on the tokenised text.
    kept = []
    for token in word_tokenize(text.strip()):
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept).strip()

# Replace each raw review by its cleaned version, then preview the result.
df["review"] = df["review"].apply(preprocess_sentence)
df.head()

Unnamed: 0,sentiment,review
0,Positive,With all this stuff going down at the moment w...
1,Positive,'The Classic War of the Worlds' by Timothy Hin...
2,Negative,The film starts with a manager (Nicholas Bell)...
3,Negative,It must be assumed that those who praised this...
4,Positive,Superbly trashy and wondrously unpretentious 8...


(25000, 2)
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Unnamed: 0,review
0,stuff going moment started listening music wat...
1,classic war worlds timothy hines entertaining ...
2,film starts manager nicholas bell giving welco...
3,must assumed praised film greatest filmed oper...
4,superbly trashy wondrously unpretentious explo...


In [4]:
### Tokens

import tensorflow as tf
# Fit a Keras tokenizer on the cleaned reviews, configured with num_words=10000.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df.review)

# Lookup tables and vocabulary size taken from the fitted tokenizer.
# NOTE(review): per the Keras docs, word_index / index_word appear to hold
# every word seen during fitting, not only the num_words most frequent ones -
# confirm before using them with ids produced by the model.
word2idx, idx2word = tokenizer.word_index, tokenizer.index_word
vocab_size = tokenizer.num_words

### Créer l'ensemble de données (X, Y)
import numpy as np


def sentenceToData(tokens, WINDOW_SIZE):
    """Build (centre word, context words) pairs from one token sequence.

    Only positions that have WINDOW_SIZE tokens available on both sides are
    used as centre positions.

    Args:
        tokens: Indexable sequence of tokens (supports ``len`` and integer
            indexing).
        WINDOW_SIZE: Number of context tokens taken on each side of the
            centre token.

    Returns:
        A tuple ``(X, Y)`` of two lists of equal length:
          * ``X[k]`` is the centre token of the k-th window.
          * ``Y[k]`` is the list of its 2 * WINDOW_SIZE context tokens,
            ordered as: right-hand neighbours from farthest to nearest,
            followed by left-hand neighbours from nearest to farthest.
    """
    # Offsets -W..-1 then 1..W; the context token is read at (position - offset).
    offsets = [*range(-WINDOW_SIZE, 0), *range(1, WINDOW_SIZE + 1)]
    centers, contexts = [], []
    for pos, center in enumerate(tokens):
        has_left = pos >= WINDOW_SIZE
        has_right = pos + WINDOW_SIZE < len(tokens)
        if not (has_left and has_right):
            # Window would run past one end of the sequence.
            continue
        centers.append(center)
        contexts.append([tokens[pos - off] for off in offsets])
    return centers, contexts


WINDOW_SIZE = 5

# X collects the context windows (model inputs), Y the centre words (targets).
X, Y = [], []
for review in df.review:
    # Convert all "."-separated sentences of the review to id sequences.
    for token_ids in tokenizer.texts_to_sequences(review.split(".")):
        if len(token_ids) < WINDOW_SIZE:
            # Too short to hold a single full window.
            continue
        # sentenceToData returns (centre words, context lists); it takes the
        # number of neighbours per side, hence WINDOW_SIZE // 2.
        centers, contexts = sentenceToData(token_ids, WINDOW_SIZE // 2)
        X.extend(contexts)
        Y.extend(centers)

X = np.array(X).astype(int)
# One target id per row.
y = np.array(Y).astype(int).reshape([-1, 1])


2024-07-31 13:55:24.316259: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
### Architecture du modèle

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

embedding_dim = 300

# Embedding of each input id, averaged over the sequence axis, followed by a
# softmax layer with vocab_size outputs.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(vocab_size, activation='softmax'),
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(X, y, batch_size=128, epochs=50)

# NOTE(review): ".h5" selects the legacy HDF5 format; recent Keras versions
# recommend the ".keras" format - confirm nothing else loads this path before
# changing it.
model.save("word2vec.h5")


Epoch 1/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m681s[0m 56ms/step - accuracy: 0.0224 - loss: 7.9315
Epoch 2/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m689s[0m 57ms/step - accuracy: 0.0553 - loss: 7.0194
Epoch 3/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m669s[0m 55ms/step - accuracy: 0.0752 - loss: 6.5212
Epoch 4/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m672s[0m 55ms/step - accuracy: 0.0907 - loss: 6.1504
Epoch 5/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m660s[0m 54ms/step - accuracy: 0.1037 - loss: 5.8573
Epoch 6/50
[1m 8578/12163[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m3:10[0m 53ms/step - accuracy: 0.1181 - loss: 5.5856