In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import mlflow
import mlflow.sklearn

In [2]:
# Load the data via pandas
import pandas as pd
df = pd.read_csv("IMDB_Dataset.csv")
#df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Preprocess text data
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenization and stemming
    words = word_tokenize(text)
    words = [ps.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

df['review'] = df['review'].apply(preprocess_text)

In [4]:
# Convert sentiment to 1 for positive, 0 for negative
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])

In [20]:
subset_size = 5000  # ou toute autre taille que vous souhaitez
df_subset = df.sample(subset_size, random_state=42)

# Utiliser le sous-ensemble pour l'entraînement
X_train, X_test, y_train, y_test = train_test_split(df_subset['review'], df_subset['sentiment'], test_size=0.2, random_state=42)

In [5]:
# Split the dataset
#X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [21]:
# Step 2: Text Vectorization
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [14]:
# Séparer le jeu de test et le jeu d'entraînement
X_train = X_test_vectorized[:200]
y_train = y_test[:200]

X_test = X_test_vectorized[200:250]
y_test = y_test[200:250]

In [22]:
# Step 3: Build and Train Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

mlflow.set_experiment("mlflow-imdb")

def evaluate(sk_model, X_test_vectorized, y_test):
    eval_acc = sk_model.score(X_test_vectorized, y_test)
    preds = sk_model.predict(X_test_vectorized)
    auc_score = roc_auc_score(y_test, preds)
    mlflow.log_metric("eval_acc", eval_acc) # nous avons fait la même chose pour l'accuracy de test (evaluation)
    mlflow.log_metric("auc_score", auc_score)# aussi pour l'AUC score 
    print(f"Auc Score: {auc_score:.3%}")
    print(f"Eval Accuracy: {eval_acc:.3%}")


2023/11/29 10:41:48 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-imdb' does not exist. Creating a new experiment.


In [23]:
sk_model = LogisticRegression(random_state=None)
mlflow.set_experiment("scikit_learn_imdb")
with mlflow.start_run():
    #train(sk_model, x_train, y_train)
    params = {
    "max_iter": 1,
    "penalty" : 'L1',
    "solver": 'liblinear',
    }
    mlflow.log_params(params)
    sk_model = sk_model.fit(X_train_vectorized, y_train)
    train_acc = sk_model.score(X_train_vectorized, y_train)
    mlflow.log_metric("train_acc", train_acc) 
    print(f"Train Accuracy: {train_acc:.3%}")
    evaluate(sk_model, X_test_vectorized, y_test)
    mlflow.sklearn.log_model(sk_model, "log_reg_model")
    print("Model run: ", mlflow.active_run().info.run_uuid)
mlflow.end_run()

Train Accuracy: 93.675%


NameError: name 'roc_auc_score' is not defined

In [None]:
sk_model = SVC(random_state=None)
mlflow.set_experiment("scikit_learn_imdb")
with mlflow.start_run():
    params = {
    "max_iter": 600,
    "C": 1,
    "kernel" : 'poly',
    }
    mlflow.log_params(params)
    sk_model = sk_model.fit(X_train_vectorized, y_train)
    train_acc = sk_model.score(X_train_vectorized, y_train)
    mlflow.log_metric("train_acc", train_acc)
    print(f"Train Accuracy: {train_acc:.3%}")
    evaluate(sk_model, X_test_vectorized, y_test)
    mlflow.sklearn.log_model(sk_model, "svc_model")
    print("Model run: ", mlflow.active_run().info.run_uuid)
mlflow.end_run()

In [None]:
sk_model = RandomForestClassifier(random_state=None)
mlflow.set_experiment("scikit_learn_imdb")
with mlflow.start_run():
    params = {
    'n_estimators': 100,
    }
    mlflow.log_params(params)
    sk_model = sk_model.fit(X_train_vectorized, y_train)
    train_acc = sk_model.score(X_train_vectorized, y_train)
    mlflow.log_metric("train_acc", train_acc)
    print(f"Train Accuracy: {train_acc:.3%}")
    evaluate(sk_model, X_test_vectorized, y_test)
    mlflow.sklearn.log_model(sk_model, "random_model")
    print("Model run: ", mlflow.active_run().info.run_uuid)
mlflow.end_run()

In [None]:
sk_model = MultinomialNB(random_state=None)
mlflow.set_experiment("scikit_learn_imdb")
with mlflow.start_run():
    sk_model = sk_model.fit(X_train_vectorized, y_train)
    train_acc = sk_model.score(X_train_vectorized, y_train)
    mlflow.log_metric("train_acc", train_acc)
    print(f"Train Accuracy: {train_acc:.3%}")
    evaluate(sk_model, X_test_vectorized, y_test)
    mlflow.sklearn.log_model(sk_model, "naives_model")
    print("Model run: ", mlflow.active_run().info.run_uuid)
mlflow.end_run()

In [None]:
sk_model = KNeighborsClassifier(random_state=None)
mlflow.set_experiment("scikit_learn_imdb")
with mlflow.start_run():
    knn_params = {
    "n_neighbors": 5,
    "weights": "uniform",
    "algorithm": "auto",
    }
    mlflow.log_params(params)
    sk_model = sk_model.fit(X_train_vectorized, y_train)
    train_acc = sk_model.score(X_train_vectorized, y_train)
    mlflow.log_metric("train_acc", train_acc)
    print(f"Train Accuracy: {train_acc:.3%}")
    evaluate(sk_model, X_test_vectorized, y_test)
    mlflow.sklearn.log_model(sk_model, "knn_model")
    print("Model run: ", mlflow.active_run().info.run_uuid)
mlflow.end_run()