<a href="https://colab.research.google.com/github/ECH-CHADLI/NLP_TP/blob/main/TP3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import pandas as pd

df = pd.read_csv('/content/sample_data/movie_review.csv')
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


# Pre-processing text data from the dataset

In [7]:
import spacy
import spacy.lang.en.stop_words as STOP_WORDS # module that'll be used for stop words
import string # will use in ponctuation

# Charger le modèle de langue
nlp = spacy.load('en_core_web_sm')

def pre_processing(text):

    # Convertir en miniscule
    text = text.lower()

    # Tokenization
    doc = nlp(text)

    # Supprimer les stopwords et la ponctuation
    tokens = [token.text for token in doc if token.text not in STOP_WORDS.STOP_WORDS and token.text not in string.punctuation]

    return tokens

proc_text = df['text'].apply(pre_processing).values
proc_text

array([list(['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'comic', 'book', 'like', 'hell']),
       list(['starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'new', 'level', 'mid', '80s', '12', 'series', 'called', 'watchmen']),
       list(['moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd']),
       ...,
       list(['watching', 'roxbury', 'skits', 'snl', 'come', 'away', 'characters', 'bob', 'heads', 'love']),
       list(['bump', 'unsuspecting', 'women']),
       list(['watching', 'a_night_at_the_roxbury', 'left', 'exactly'])],
      dtype=object)

# Entrainement du modèle Word2Vec

In [18]:
from gensim.models import Word2Vec

model = Word2Vec(proc_text, vector_size=100, window=5, min_count=1, sg=0) # utiliser l'algorithme CBOW

vector = model.wv['films']
print(vector)
similar = model.wv.most_similar('inspiration')
print(similar)
#model.save("word2vec.model")

array([list(['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'comic', 'book', 'like', 'hell']),
       list(['starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'new', 'level', 'mid', '80s', '12', 'series', 'called', 'watchmen']),
       list(['moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd']),
       ...,
       list(['watching', 'roxbury', 'skits', 'snl', 'come', 'away', 'characters', 'bob', 'heads', 'love']),
       list(['bump', 'unsuspecting', 'women']),
       list(['watching', 'a_night_at_the_roxbury', 'left', 'exactly'])],
      dtype=object)

# Vectorisation des Reviews des films

In [16]:
def vectorize_review(review, model):
    vectors = [model.wv[word] for word in review]
    if vectors:
        return sum(vectors)/len(vectors)
    else:
        return 0


df['vectorized_text'] = [vectorize_review(review, model) for review in proc_text]
#print(df['vectorized_text'])
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,vectorized_text
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos,"[-0.011386539, 0.25315222, 0.32116765, 0.11077..."
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos,"[0.019843714, 0.28355268, 0.39218983, 0.164506..."
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos,"[0.011017825, 0.32850763, 0.35978535, 0.091372..."
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos,"[-0.07852615, 0.3385619, 0.29541576, 0.0984487..."
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos,"[-0.119283, 0.28504375, 0.35058528, 0.12721694..."


# Training & Testing data division

In [57]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Turning the target column from categorical values to numerical ones, in order to word with logistic regression (binary classification(pos, neg))
le = preprocessing.LabelEncoder()
df['tag'] = le.fit_transform(df['tag'])

X = np.array(df['vectorized_text']).reshape(-1, 1)
y = np.array(df['tag'])

#X = df['vectorized_text'].tolist()
#y = df['tag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 20% of data for testing
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,vectorized_text
0,0,cv000,29590,0,films adapted from comic books have had plenty...,1,"[-0.011386539, 0.25315222, 0.32116765, 0.11077..."
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",1,"[0.019843714, 0.28355268, 0.39218983, 0.164506..."
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,1,"[0.011017825, 0.32850763, 0.35978535, 0.091372..."
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",1,"[-0.07852615, 0.3385619, 0.29541576, 0.0984487..."
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",1,"[-0.119283, 0.28504375, 0.35058528, 0.12721694..."


# Construction d'un classificateur, dans ce cas on va utiliser la Regression Logistique

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = LogisticRegression()

# Entrainement du model
model.fit(X_train, y_train)

# Prediction
pred = model.predict(X_test)
print("prediction d'après le test set: ", pred)

# Evaluation
accuracy = accuracy_score(y_test, pred)
print("Accuracy: ", accuracy)

precision = precision_score(y_test, pred)
print("Pricision: ", precision)

recall = recall_score(y_test, pred)
print("Recall: ", recall)

f1 = f1_score(y_test, pred)
print("F1 score: ", f1)


ValueError: setting an array element with a sequence.