<h1><center>TP3 : NLP Classification de texte</center></h1>

*Fait par BAHI Brahim*

In [103]:
import re
import pandas as pd
import gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import nltk
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

In [85]:
# Make sure the NLTK stop-word corpus is available locally before it is used.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bahib\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Pre-processing of textual data
___

In [86]:
data = pd.read_csv('movie_review.csv')

# Build the English stop-word set once. Calling stopwords.words('english') inside
# the comprehension rebuilds the list for every single token and then tests
# membership with a linear scan; a set built up front gives O(1) lookups.
english_stopwords = set(stopwords.words('english'))

# Matches lowercase ASCII words that are at least two letters long.
word_pattern = re.compile(r'\b[a-z][a-z]+\b')

# Turn the list of texts into a list of word lists: lowercase words of
# minimum length 2 that are not English stop words.
texts = [
  [word for word in word_pattern.findall(sentence.lower()) if word not in english_stopwords]
  for sentence in data['text'].tolist()
]

# Encode the labels: 'pos' -> 1, anything else -> 0.
tags = [1 if tag.strip() == 'pos' else 0 for tag in data['tag'].tolist()]

# Preview the first tokenized review together with its encoded label.
f"{texts[0]} - tag:{tags[0]}"

"['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'never', 'really', 'comic', 'book', 'like', 'hell'] - tag:1"

### Training the Word2Vec model
___

In [148]:
# Train word embeddings on the tokenized reviews.
w2v = Word2Vec(
  texts,
  vector_size=125,  # dimensionality of each word vector
  window=15,        # context window size
  min_count=4,      # minimum word frequency threshold
  sg=1,             # 1 selects skip-gram training
  workers=2,        # number of training threads
)

In [149]:
# Number of distinct words kept in the trained model's vocabulary.
len(w2v.wv.index_to_key)

16546

### Vectorizing movie reviews
___

In [150]:
def review_to_vector(review, model):
  """Return the mean word vector of a tokenized review.

  Args:
    review: iterable of word tokens.
    model: trained model exposing ``wv`` (keyed word vectors) and ``vector_size``.

  Returns:
    A 1-D numpy array of length ``model.vector_size``: the element-wise mean of
    the vectors of the words found in the model's vocabulary, or a zero vector
    when none of the words is in the vocabulary (including an empty review).
  """
  # key_to_index is a dict, so membership is a hash lookup. The previous check
  # against the index_to_key list scanned the whole vocabulary for every word.
  vocabulary = model.wv.key_to_index
  vectors = [model.wv[word] for word in review if word in vocabulary]

  if not vectors:
    return np.zeros(model.vector_size)

  return np.mean(vectors, axis=0)

In [151]:
# Embed every tokenized review as a single fixed-size vector.
vectorized_texts = [
  review_to_vector(tokens, w2v)
  for tokens in texts
]

# Show the embedding of the first review as a sanity check.
vectorized_texts[0]

array([ 0.2139607 , -0.17659612,  0.17477573, -0.10878945, -0.2817444 ,
        0.05977948,  0.08163511,  0.31405854, -0.03398285, -0.12014479,
        0.08496115,  0.10463213, -0.0162975 , -0.01009984, -0.16343477,
       -0.07263597, -0.15547885,  0.11306285, -0.29203606, -0.03101908,
        0.25516757,  0.25758794,  0.09752367, -0.04316832,  0.07217384,
        0.04839683, -0.01595195, -0.14867945, -0.31790262, -0.11132312,
       -0.03740641,  0.19445992, -0.02986876, -0.23629451,  0.2439329 ,
       -0.18396449,  0.16994424, -0.16726835,  0.0626245 , -0.2877777 ,
       -0.06047134,  0.22807921, -0.05109709,  0.11486401,  0.06779773,
        0.02256543,  0.16853921, -0.07504633,  0.165442  , -0.07412568,
       -0.19810633,  0.05067117, -0.27174202, -0.13305773,  0.10415772,
       -0.25226343,  0.10867754, -0.04687229, -0.10220748, -0.13163643,
       -0.09643636, -0.20159763,  0.0160696 ,  0.05357694, -0.080737  ,
        0.3265387 , -0.13526489,  0.05354112,  0.08135698,  0.18

### Splitting the data
___

In [152]:
# Hold out 25% of the reviews for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  vectorized_texts,
  tags,
  test_size=0.25,
  random_state=42,
)

In [153]:
# Random forest is the model evaluated below; LogisticRegression() is the
# alternative that can be swapped in here for comparison.
classifier = RandomForestClassifier(
  n_estimators=200,
  random_state=42,
)

In [154]:
# Fit the classifier on the training review vectors and their labels.
classifier.fit(X_train, y_train)

In [155]:
# Predict labels for the held-out test reviews.
y_pred = classifier.predict(X_test)

### Evaluating the model
___

In [156]:
# Score the test-set predictions with each metric, in the order they are reported.
accuracy, precision, recall, f1 = (
  metric(y_test, y_pred)
  for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One line per metric.
print(
  f"Accuracy: {accuracy}",
  f"Precision: {precision}",
  f"Recall: {recall}",
  f"F1 Score: {f1}",
  sep="\n",
)

Accuracy: 0.6157601977750309
Precision: 0.6193408185440058
Recall: 0.6260678545277032
F1 Score: 0.6226861685986527
