## Importation

In [3]:
import numpy as np
import pandas as pd
import joblib, os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.utils import column_or_1d
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

## Récupération des données

In [5]:
# Load the review table from the CSV file located next to the notebook.
all_data = pd.read_csv("imdb.csv", encoding='latin-1')

# Preview the first 100 rows, then report the column names and the row count.
print("Dataset chargé — premières lignes :")
display(all_data.head(n=100))
print("\nColonnes :", list(all_data.columns))
print("Nombre de lignes :", all_data.shape[0])

Dataset chargé — premières lignes :


Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
...,...,...,...,...,...
95,95,test,Tyra Banks needs to teach these girls that it'...,neg,10086_1.txt
96,96,test,"This is by far the most vapid, idiotic, insane...",neg,10087_1.txt
97,97,test,It was awful plain and simple. What was their ...,neg,10088_1.txt
98,98,test,Wow! i think they made this movie to torture p...,neg,10089_1.txt



Colonnes : ['Unnamed: 0', 'type', 'review', 'label', 'file']
Nombre de lignes : 100000


In [6]:
# Cleaning: drop rows containing any missing value, drop the rows whose label
# is 'unsup', then keep only the last occurrence of each duplicated review text.
all_data = (
    all_data
    .dropna()
    .loc[lambda frame: frame['label'] != 'unsup']
    .drop_duplicates(subset='review', keep='last')
)

# Features are the raw review texts, targets are the labels.
X, y = all_data['review'], all_data['label']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Renumber the text Series from 0 so rows can be addressed with 0..n-1 labels.
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

# Flatten the labels into 1-D NumPy arrays.
y_train = column_or_1d(np.ravel(y_train), warn=False)
y_test = column_or_1d(np.ravel(y_test), warn=False)

## Fonctions de Preprocessing

In [7]:
def fetch_stopwords():
    """Return the stop-word set: NLTK English stop words plus custom ones.

    Custom words are read from ``My_stopwords.txt`` (comma-separated; each
    entry may be wrapped in single or double quotes and is lower-cased). If
    the file is missing or cannot be read, a message is printed and only the
    NLTK list is used.

    The result is computed once and cached on the function object. This
    function is called for every document being preprocessed, so without the
    cache the NLTK corpus and the custom file would be re-read (and any
    warning re-printed) once per document. Re-run this definition, or set
    ``fetch_stopwords._cache = None``, to force a reload after editing the
    file.

    Returns:
        set[str]: a fresh copy of the stop words, so callers may mutate it
        without affecting later calls.
    """
    cached = getattr(fetch_stopwords, "_cache", None)
    if cached is not None:
        return set(cached)

    filepath = "My_stopwords.txt"
    all_stopwords = set(stopwords.words('english'))
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            raw_data = f.read()
    except FileNotFoundError:
        print("Le fichier My_stopwords.txt est introuvable. Les stopwords par défaut seront utilisés.")
        raw_data = ""
    except Exception as e:
        # Best effort: any other read problem falls back to the default list.
        print(f"Une erreur est survenue : {e}")
        raw_data = ""

    for entry in raw_data.split(","):
        # Strip surrounding whitespace, then either kind of quote, then any
        # whitespace that was inside the quotes.
        word = entry.strip().strip("\"'").strip().lower()
        if word:
            all_stopwords.add(word)

    # Store an immutable snapshot; every call hands out its own copy.
    fetch_stopwords._cache = frozenset(all_stopwords)
    return all_stopwords

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of that tag selects the constant: J -> ADJ, V -> VERB, R -> ADV.
    N and every other letter map to NOUN.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

def preprocessing(line):
    """Turn one raw review into a list of cleaned, lemmatized tokens.

    Steps: tokenize with ``word_tokenize``; keep purely alphabetic tokens
    longer than one character; lower-case them and drop stop words; lemmatize
    each remaining word with the part of speech given by ``get_wordnet_pos``;
    drop any lemma that is itself a stop word.

    Returns:
        list[str]: the surviving lemmas, in their original order.
    """
    raw_tokens = word_tokenize(line)
    stop_set = fetch_stopwords()
    lemmatizer = WordNetLemmatizer()

    kept = []
    for token in raw_tokens:
        # Discard punctuation, numbers, mixed tokens and one-letter words.
        if not token.isalpha() or len(token) <= 1:
            continue
        lowered = token.lower()
        if lowered in stop_set:
            continue
        lemma = lemmatizer.lemmatize(lowered, get_wordnet_pos(lowered))
        # Lemmatization can produce a stop word, so filter a second time.
        if lemma not in stop_set:
            kept.append(lemma)
    return kept

## Preprocessing

In [8]:
# Replace every training review with its list of cleaned, lemmatized tokens.
# Series.apply builds a new Series, instead of overwriting X_train element by
# element while it is being iterated with a hand-maintained counter.
X_train = X_train.apply(preprocessing)

In [9]:
# Replace every test review with its list of cleaned, lemmatized tokens.
# Series.apply builds a new Series, instead of overwriting X_test element by
# element while it is being iterated with a hand-maintained counter.
X_test = X_test.apply(preprocessing)

## Test des modèles disponibles

In [None]:
# Model-comparison experiment, kept commented out for reference: it fits
# RandomForest, SVC, KNN and MLP on TF-IDF features and stores each test
# accuracy in `results`.
# NOTE(review): at this point in the notebook X_train / X_test hold token lists
# (they are joined back into strings only in a later cell), so this presumably
# has to run after that join -- confirm before re-enabling.
# vectorizer = TfidfVectorizer(stop_words="english")
# X_train_vec = vectorizer.fit_transform(X_train)
# X_test_vec  = vectorizer.transform(X_test)
#
# models = {
#     "RandomForest": RandomForestClassifier(),
#     "SVM": SVC(),
#     "KNN": KNeighborsClassifier(),
#     "MLP": MLPClassifier(),
# }
#
# results = {}
#
# for name, model in models.items():
#     pipe = Pipeline([
#         ("model", model)
#     ])
#
#     pipe.fit(X_train_vec, y_train)
#     y_pred = pipe.predict(X_test_vec)
#     results[name] = accuracy_score(y_test, y_pred)
#
# print(results)

In [None]:
# Logistic-regression baseline, kept commented out for reference: it recodes
# the labels 'neg'/'pos' as 0/1, fits a TF-IDF + LogisticRegression pipeline
# and prints the accuracy and the classification report.
# NOTE(review): re-enabling this turns y_train / y_test into integer arrays for
# every later cell -- confirm that is intended.
# y_train[y_train=="neg"] = 0
# y_train[y_train=="pos"] = 1
#
# y_test[y_test=="neg"] = 0
# y_test[y_test=="pos"] = 1
#
# y_train = y_train.astype(int)
# y_test = y_test.astype(int)
#
# model = Pipeline([
#     ("tfidf", TfidfVectorizer()),
#     ("clf", LogisticRegression(max_iter=1000))
# ])
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
#
#
# print("Accuracy :", accuracy_score(y_test, y_pred))
# print("\nClassification report :")
# print(classification_report(y_test, y_pred))

## Entraînement du modèle final

Accuracy obtenue par chaque modèle testé :

{'RandomForest': 0.8520336134453782, 'SVM': 0.8989579831932774, 'KNN': 0.7719663865546218, 'MLP': 0.8783193277310924, 'LogisticRegression': 0.890890756302521}

In [10]:
# Rebuild one space-separated string per review from its token list so the
# TF-IDF step of the pipeline receives text. Values that are already strings
# are left untouched: re-running this cell would otherwise join the individual
# characters of each review ("l e a d ...").
X_train = X_train.apply(lambda mots: mots if isinstance(mots, str) else " ".join(mots))
X_test = X_test.apply(lambda mots: mots if isinstance(mots, str) else " ".join(mots))
X_test

0        lead actress strikingly beautiful plot stand c...
1        view movie time nuance perception life ordinar...
2        otaku day robotech gunbuster favorite anime ti...
3        today thought people include heaven gate maste...
4        basic idea movie good real character developme...
                               ...                        
14870    rks success ghayal start film comedy film year...
14871    boring film main cast member click giovanni ri...
14872    kate beckinsale good gwyneth paltrow emma movi...
14873    angst imdb reviewer hate film masterpiece view...
14874    start watch year friend time young enjoy joke ...
Name: review, Length: 14875, dtype: object

In [11]:
# Final model: TF-IDF features fed into a support-vector classifier, both
# created with their default settings.
pipe = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("svc", SVC()),
])

print("Début de l'entraînement... (peut prendre quelques minutes)")
pipe.fit(X_train, y_train)
print("Entraînement terminé.")

# Evaluate on the held-out split: overall accuracy, then the per-class report.
y_pred = pipe.predict(X_test)
print("Accuracy :", accuracy_score(y_test, y_pred))
print("\nClassification report :", classification_report(y_test, y_pred), sep="\n")

Début de l'entraînement... (peut prendre quelques minutes)
Entraînement terminé.
Accuracy : 0.8829579831932773

Classification report :
              precision    recall  f1-score   support

         neg       0.90      0.87      0.88      7500
         pos       0.87      0.90      0.88      7375

    accuracy                           0.88     14875
   macro avg       0.88      0.88      0.88     14875
weighted avg       0.88      0.88      0.88     14875



## Enregistrement du modèle

In [15]:
import pickle

# Serialize the fitted pipeline to disk. The context manager guarantees the
# file is flushed and closed once the dump finishes, and the path comes from
# model_path so the written file and the printed message cannot diverge.
model_path = "model_final.pkl"

with open(model_path, "wb") as model_file:
    pickle.dump(pipe, model_file)
print(f"Modèle sauvegardé dans: {model_path}")

Modèle sauvegardé dans: model_final.pkl
