<a href="https://colab.research.google.com/github/DesignToWebsite/NLP-classique/blob/main/tp3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [53]:
import pandas as pd
import numpy as np
import nltk
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
# dataset
data = pd.read_csv("movie_review.csv")

In [55]:
data.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [56]:
# définir les stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Fct de prétraitement du texte
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Suppression des stopwords et la ponctuation, convertir en minuscules
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.isalpha()]
    return tokens


In [58]:
data['text'].head()

0    films adapted from comic books have had plenty...
1    for starters , it was created by alan moore ( ...
2    to say moore and campbell thoroughly researche...
3    the book ( or " graphic novel , " if you will ...
4    in other words , don't dismiss this film becau...
Name: text, dtype: object

In [59]:
# le prétraitement sur la colonne de texte
data['preprocessed_text'] = data['text'].apply(preprocess_text)

In [60]:
data['preprocessed_text'].head()

0    [films, adapted, comic, books, plenty, success...
1    [starters, created, alan, moore, eddie, campbe...
2    [say, moore, campbell, thoroughly, researched,...
3    [book, graphic, novel, pages, long, includes, ...
4                       [words, dismiss, film, source]
Name: preprocessed_text, dtype: object

In [61]:
# Entraînement du modèle Word2Vec
model = Word2Vec(sentences=data['preprocessed_text'], vector_size=100, window=5, min_count=1, workers=4)

In [62]:
# calculer la moyenne des embeddings Word2Vec pour chaque review
def get_review_vector(review):
    vectors = []
    for word in review:
        if word in model.wv:
            vectors.append(model.wv[word])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(100)

In [63]:
data.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,preprocessed_text
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos,"[films, adapted, comic, books, plenty, success..."
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos,"[starters, created, alan, moore, eddie, campbe..."
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos,"[say, moore, campbell, thoroughly, researched,..."
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos,"[book, graphic, novel, pages, long, includes, ..."
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos,"[words, dismiss, film, source]"


In [64]:
# obtenir les vecteurs de review
data['review_vector'] = data['preprocessed_text'].apply(get_review_vector)

In [65]:
data.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,preprocessed_text,review_vector
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos,"[films, adapted, comic, books, plenty, success...","[-0.2311913, 0.321317, 0.33065805, -0.05810570..."
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos,"[starters, created, alan, moore, eddie, campbe...","[-0.1624283, 0.2805944, 0.23107629, -0.2027535..."
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos,"[say, moore, campbell, thoroughly, researched,...","[-0.29502234, 0.32080394, 0.3769225, -0.083074..."
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos,"[book, graphic, novel, pages, long, includes, ...","[-0.21432027, 0.37205285, 0.1988842, -0.018311..."
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos,"[words, dismiss, film, source]","[-0.2351292, 0.38167977, 0.24175577, 0.1049006..."


In [66]:
# Division du datasets
X_train, X_test, y_train, y_test = train_test_split(data['review_vector'].tolist(), data['tag'], test_size=0.2, random_state=42)


In [67]:
# Initialisation + entraînement
classifier = LogisticRegression()
classifier.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
#prédictions
y_pred = classifier.predict(X_test)


In [69]:
#evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='pos')
recall = recall_score(y_test, y_pred, pos_label='pos')
f1 = f1_score(y_test, y_pred, pos_label='pos')

In [70]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.5635815822002472
Precision: 0.5592915811088296
Recall: 0.6630153658907653
F1 score: 0.606752523494605
