<h1> GI5 <center> NLP Classification de texte </center><br> By Ammari Youssef

## <font color=red> Chargement du données

In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv("movie_review.csv")
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


## <font color=red> Preprocessing de données textuelles (toLowercase , zero stopwords/punctuation ..etc)

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def preprocess_text(text):

    # Tokenization and lowercasing
    tokens = word_tokenize(text.lower())

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

    return tokens


# Apply pre-processing to the text column
df['processed_text'] = df['text'].apply(preprocess_text)


In [4]:
df['processed_text'].head()

0    [films, adapted, comic, books, plenty, success...
1    [starters, created, alan, moore, eddie, campbe...
2    [say, moore, campbell, thoroughly, researched,...
3    [book, graphic, novel, 500, pages, long, inclu...
4                       [words, dismiss, film, source]
Name: processed_text, dtype: object

## <font color=red> Entrainement du modèle Word2Vec

In [5]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Train Word2Vec model
model = Word2Vec(sentences=df['processed_text'],
                 vector_size=100, window=5, min_count=1, sg=0)


## <font color=red> Vectorisation des reviews de movies :

In [6]:
import numpy as np
# Vectorize reviews using Word2Vec embeddings
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.

    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])

    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector

# Création d'un vocabulaire
vocabulary = set(model.wv.index_to_key)

# Vectorisation des reviews
df['vector'] = df['text'].apply(lambda x: average_word_vectors(word_tokenize(x), model, vocabulary, 100))


In [7]:
df['vector'][0]

array([-0.19379095,  0.50768781,  0.29603772,  0.15621947,  0.03552601,
       -0.92494812,  0.196914  ,  0.97582599, -0.27228703, -0.61890008,
       -0.07800748, -0.65189194, -0.12398562,  0.308089  ,  0.34736456,
       -0.30957173, -0.00586812, -0.40612961, -0.02913243, -1.04193045,
        0.32267498,  0.07459503,  0.23368766, -0.14311619, -0.03261911,
        0.04751716, -0.26170728, -0.14647747, -0.47522261,  0.07631433,
        0.19639288, -0.03694258,  0.03229123, -0.60513341, -0.0257286 ,
        0.39964624,  0.22196333, -0.36366683, -0.42888313, -0.64676978,
        0.22336517, -0.43486335, -0.14518735,  0.14022401,  0.5016803 ,
       -0.28679916, -0.44283414, -0.00607692,  0.19653016,  0.2459543 ,
        0.29888651, -0.46299286, -0.48669617, -0.09071458, -0.32579215,
        0.07468978,  0.35180817, -0.15028563, -0.2748481 ,  0.11233867,
        0.20207276,  0.24658376, -0.1355174 , -0.0211116 , -0.4204115 ,
        0.37753133,  0.18743955,  0.47467294, -0.59675974,  0.48

## <font color=red> Division des données :

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Encodage de la variable cible (tag)
label_encoder = LabelEncoder()
df['tag_encoded'] = label_encoder.fit_transform(df['tag'])

# Division des données
X_train, X_test, y_train, y_test = train_test_split(np.vstack(df['vector']), df['tag_encoded'], test_size=0.2, random_state=42)


In [9]:
print(X_train.shape)
print(len(y_train))


(51776, 100)
51776


## <font color=red> Construction d'un modéle classificateur :

In [10]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Évaluation du modèle
y_pred = classifier.predict(X_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## <font color=red> Evaluation du modéle

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.5671353522867738
Precision: 0.5682989766460126
Recall: 0.5671353522867738
F1 Score: 0.5625915815122414
