In [25]:
# Importation des biblio
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Charger le dataset
df = pd.read_csv('/content/sample_data/movie_review.csv')

#Supprimer les donne indesirable
del df["fold_id"]
del df["html_id"]
del df["sent_id"]
del df["cv_tag"]

df


Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos
...,...,...
64715,that lack of inspiration can be traced back to...,neg
64716,like too many of the skits on the current inca...,neg
64717,"after watching one of the "" roxbury "" skits on...",neg
64718,"bump unsuspecting women , and . . . that's all .",neg


In [None]:
# Pré-traitement des données textuelles
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    # Supprimer la ponctuation et convertir en minuscules
    text = re.sub(r'[^\w\s]', '', str(text).lower())
    # Supprimer les stop words et appliquer la racinisation (stemming)
    tokens = [ps.stem(word) for word in word_tokenize(text) if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)

#On remplace donc les text prossesser a notre dataset
df['text'] = df['text'].apply(preprocess_text)

df

Unnamed: 0,text,tag
0,film adapt comic book plenti success whether t...,pos
1,starter creat alan moor eddi campbel brought m...,pos
2,say moor campbel thoroughli research subject j...,pos
3,book graphic novel 500 page long includ nearli...,pos
4,word dont dismiss film sourc,pos
...,...,...
64715,lack inspir trace back insipid charact,neg
64716,like mani skit current incarn roxburi guy onej...,neg
64717,watch one roxburi skit snl come away charact b...,neg
64718,bump unsuspect women that,neg


In [None]:
# Entraînement du modèle Word2Vec
sentences = [word_tokenize(text) for text in df['text']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)


In [None]:
# Vectorisation des reviews de movies
def vectorize_text(text, model):
    vectors = [model.wv[word] for word in word_tokenize(text) if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df['text_vectoriser'] = df['text'].apply(lambda x: vectorize_text(x, word2vec_model))

df

Unnamed: 0,text,tag,text_vectoriser
0,film adapt comic book plenti success whether t...,pos,"[-0.37383977, 0.4303437, 0.3116039, 0.14030515..."
1,starter creat alan moor eddi campbel brought m...,pos,"[-0.40638283, 0.2274493, 0.22095825, 0.0873262..."
2,say moor campbel thoroughli research subject j...,pos,"[-0.2776821, 0.50106883, 0.15677324, -0.005114..."
3,book graphic novel 500 page long includ nearli...,pos,"[-0.49513778, 0.3236461, 0.39235196, 0.2183452..."
4,word dont dismiss film sourc,pos,"[-0.27814537, 0.42095193, 0.3457808, 0.2079162..."
...,...,...,...
64715,lack inspir trace back insipid charact,neg,"[-0.3562996, 0.35413876, 0.33045554, -0.044667..."
64716,like mani skit current incarn roxburi guy onej...,neg,"[-0.22511573, 0.38429466, 0.13517281, -0.02828..."
64717,watch one roxburi skit snl come away charact b...,neg,"[-0.15652063, 0.2576147, 0.14695568, -0.044947..."
64718,bump unsuspect women that,neg,"[-0.216848, 0.2876438, 0.14774294, -0.01186521..."


In [22]:
# Division des données
X_train, X_test, y_train, y_test = train_test_split(
    np.vstack(df['text_vectoriser']), df['tag'], test_size=0.2, random_state=42
)

In [23]:
# Construction du classificateur
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = classifier.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Évaluation du modèle
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.5730
Precision: 0.5736
Recall: 0.5730
F1 Score: 0.5702
