## Classification de critiques de films

Par CHACHA Ali Ahmed Bachir 

chachaaliahmed0@gmail.com

ESG@2022-2023 BD 5ème année

## 1. Importation des modules

In [1]:
import pickle 
import pandas as pd
import numpy as np                                              
#from textblob import TextBlob  
from sklearn.feature_extraction.text import CountVectorizer   
from sklearn.model_selection import train_test_split           
from sklearn.linear_model import LogisticRegression            
from sklearn.metrics import confusion_matrix                   
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords 
#from wordcloud import WordCloud 
import matplotlib.pyplot as plt 

## 2. chargement des données

In [2]:
# Enregistrement des données dans des variables: pos_data = critiques positives| neg_data = critiques négatives 
    
pos_data = pd.read_pickle("/Users/aliahmedbachirchacha/Documents/tp_machine_learning/imdb_raw_pos.pickle")
neg_data = pd.read_pickle("/Users/aliahmedbachirchacha/Documents/tp_machine_learning/imdb_raw_neg.pickle")
    
print("Le dataset de critiques positives contient " + str(len(pos_data)) + " observations.")
print("Le dataset de critiques négatives contient " + str(len(neg_data)) + " observations.")

Le dataset de critiques positives contient 12500 observations.
Le dataset de critiques négatives contient 12500 observations.


In [3]:
positif_df = pd.DataFrame({"commentaire":pos_data})

In [4]:
negatif_df = pd.DataFrame({"commentaire":neg_data})

In [17]:
positif_df["sentiment"]=1
negatif_df["sentiment"]=0

In [20]:
positif_df.head()

Unnamed: 0,commentaire,sentiment
0,i went and saw this movie last night after bei...,1
1,actor turned director bill paxton follows up h...,1
2,as a recreational golfer with some knowledge o...,1
3,i saw this film in a sneak preview and it is ...,1
4,bill paxton has taken the true story of the ...,1


In [22]:
df_final = pd.concat([positif_df, negatif_df])

In [25]:
df_final

Unnamed: 0,commentaire,sentiment
0,i went and saw this movie last night after bei...,1
1,actor turned director bill paxton follows up h...,1
2,as a recreational golfer with some knowledge o...,1
3,i saw this film in a sneak preview and it is ...,1
4,bill paxton has taken the true story of the ...,1
...,...,...
12495,i occasionally let my kids watch this garbage ...,0
12496,when all we have anymore is pretty much realit...,0
12497,the basic genre is a thriller intercut with an...,0
12498,four things intrigued me as to this film - fir...,0


## 3. Prétraitements des données 

In [None]:
caracteres_speciaux=[',','"','<br />',';','--','---','`','/',"'","]",'[','?',':','(',')','<','>','|','{','}','*','%','_','!',".",'~','1','2','3','4','5','6','7','8','9','0','$']

In [30]:
df_final['commentaire'].replace(caracteres_speciaux, '')

0        i went and saw this movie last night after bei...
1        actor turned director bill paxton follows up h...
2        as a recreational golfer with some knowledge o...
3        i saw this film in a sneak preview  and it is ...
4        bill paxton has taken the true story of the   ...
                               ...                        
12495    i occasionally let my kids watch this garbage ...
12496    when all we have anymore is pretty much realit...
12497    the basic genre is a thriller intercut with an...
12498    four things intrigued me as to this film - fir...
12499    david bryce s comments nearby are exceptionall...
Name: commentaire, Length: 25000, dtype: object

### Split data

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(df_final.commentaire,df_final.sentiment, test_size= 0.4)

In [35]:
print("X_train size : ", X_train.shape)
print("X_test size : ", X_test.shape)
print("Y_train size : ", Y_train.shape)
print("Y_test size : ", Y_test.shape)

X_train size :  (15000,)
X_test size :  (10000,)
Y_train size :  (15000,)
Y_test size :  (10000,)


In [4]:
Data= np.hstack((pos_data,neg_data))

#nombre de lignes de Data
print("La matrice Data contient donc " + str(Data.shape[0]) + " lignes.")  
print("---------------------")
 #Affichage de la première critique de Data = première critique de pos_data
print(Data[0]) 
print("---------------------")
#Affichage de la dernière critique de Data = dernière critique de neg_data
print(Data[24999])                                                         


La matrice Data contient donc 25000 lignes.
---------------------
i went and saw this movie last night after being coaxed to by a few friends of mine  i ll admit that i was reluctant to see it because from what i knew of ashton kutcher he was only able to do comedy  i was wrong  kutcher played the character of jake fischer very well  and kevin costner played ben randall with such professionalism  the sign of a good movie is that it can toy with our emotions  this one did exactly that  the entire theater  which was sold out  was overcome by laughter during the first half of the movie  and were moved to tears during the second half  while exiting the theater i not only saw many women in tears  but many full grown men as well  trying desperately not to let anyone see them crying  this movie was great  and i suggest that you go see it before you judge 
---------------------
david bryce s comments nearby are exceptionally well written and informative as almost say everything i feel about da

## 4. Réprésentation du corpus à l'aide des sacs de mots 

In [9]:
#stopwords
stop = set(stopwords.words("english"))
#Mise à jour des stopwords car ceux ci ont sensiblement le même poids dans les deux set de critiques.
stop.update(['movie','film','like','one','see','story','time','would','also','people','movies','show','even','good','really','much','make',
                  'watch','first','think','characters','way','films','many','could','seen','made','character','little','get','know','two','well','ever','never','say','end','10','plot','scene',
                  'scenes','great','acting','better','funny','actually','go','life','makes','going','man','actors'])


#Création de la Matrice X avec le module counvectorizer
cv = CountVectorizer(binary=True,stop_words=(stop)) #Stop_words pour supprimer les mots vides 
cv.fit(Data)
X = cv.transform(Data) 

#print(stop)
print("La matrice X créée a donc "+ str(X._shape[0]) + " lignes et " + str(X._shape[1]) + " colonnes.")
print(str(X._shape[1]) + " est donc la taille de notre dictionnaire (Vocabulaire) c'est à dire le nombre de mots uniques trouvés dans les différentes critiques .")



La matrice X créée a donc 25000 lignes et 72334 colonnes.
72334 est donc la taille de notre dictionnaire (Vocabulaire) c'est à dire le nombre de mots uniques trouvés dans les différentes critiques .


In [10]:
#Les différents mots de notre dictionnaire / vocabulaire
#print(cv.get_feature_names()) 
Features_names = cv.get_feature_names()
Features_names_300=[]
for i in range(0,300):
        Features_names_300.append(Features_names[i])

#Affiche Les 300 premiers mots du vocabulaire
print(Features_names_300)

['aa', 'aaa', 'aaaaaaaaaaaahhhhhhhhhhhhhh', 'aaaaaaaargh', 'aaaaaaahhhhhhggg', 'aaaaagh', 'aaaaah', 'aaaaahhhh', 'aaaaargh', 'aaaaarrrrrrgggggghhhhhh', 'aaaaaw', 'aaaahhhhhh', 'aaaawwwwww', 'aaaggghhhhhhh', 'aaagh', 'aaah', 'aaahhhhhhh', 'aaall', 'aaam', 'aaand', 'aaargh', 'aaarrrgh', 'aab', 'aachen', 'aagh', 'aah', 'aahe', 'aahed', 'aahhhh', 'aahing', 'aaila', 'aailiyah', 'aaja', 'aajala', 'aak', 'aake', 'aalcc', 'aaliyah', 'aalox', 'aamir', 'aamr', 'aamto', 'aaoon', 'aap', 'aapke', 'aaran', 'aardman', 'aardvark', 'aardvarks', 'aarf', 'aargh', 'aarika', 'aaron', 'aashok', 'aasmaan', 'aasman', 'aaton', 'aau', 'aawip', 'ab', 'aback', 'abagail', 'abandon', 'abandoned', 'abandoning', 'abandonment', 'abandons', 'abasing', 'abba', 'abbas', 'abbe', 'abbey', 'abbie', 'abbot', 'abbott', 'abbotts', 'abbreviate', 'abbreviated', 'abbreviating', 'abby', 'abbyss', 'abc', 'abcs', 'abd', 'abdalla', 'abderrahmane', 'abdicated', 'abdomen', 'abdominal', 'abdoo', 'abduct', 'abducted', 'abductee', 'abduct