In [8]:
import os
import re
import string

import mlflow
import mlflow.sklearn
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
# Read the IMDB reviews CSV into a DataFrame (pandas is imported in the
# imports cell), report its dimensions, and preview the first rows.
df = pd.read_csv("IMDB_Dataset.csv")
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [18]:
# NOTE(review): scratch cell; nothing else in the visible notebook uses `a`.
# Candidate for removal.
a = 23
type(a)

int

In [3]:
# Preprocess text data

# Fetch the NLTK corpora *before* using them: stopwords.words() and
# WordNetLemmatizer look the data up on disk and raise LookupError when it
# has not been downloaded yet.
nltk.download('stopwords')
nltk.download('wordnet')

# Built once and shared by every preprocess_text() call instead of being
# recreated for each review.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()  # not used by preprocess_text; kept so existing references keep working

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmas.

    Steps, in order: lowercase the text, delete every character found in
    ``string.punctuation``, split on whitespace, drop English stop words,
    and lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Lowercase, then strip punctuation in a single pass.
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Whitespace tokenization, stop-word removal, and lemmatization.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return ' '.join(tokens)


# Overwrites the raw reviews in place; re-run the data-loading cell first if
# the original text is needed again.
df['review'] = df['review'].apply(preprocess_text)

In [17]:
# Display the review stored under index label 0.
df.loc[0, 'review']

'one review mention watch oz episod hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side'

In [4]:
# Replace the string sentiment labels with integer codes
# (1 for positive, 0 for negative).
label_encoder = LabelEncoder()
sentiment_codes = label_encoder.fit_transform(df['sentiment'])
df['sentiment'] = sentiment_codes

In [5]:
# Draw a fixed-seed random subset of the reviews (any other size can be used).
subset_size = 5000
df_subset = df.sample(n=subset_size, random_state=42)

# Use the subset for training: hold out 20% of it as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    df_subset['review'],
    df_subset['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Step 2: Text Vectorization
# Fit the TF-IDF vocabulary on the training reviews only, then reuse the
# fitted vectorizer to transform the test reviews.
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [10]:
# Initialize models.
# Keys are the display names reused as the MLflow run name and as the prefix
# of the logged model's artifact path.
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    # Seeded so the forest, and therefore the best-model comparison, is
    # reproducible across re-runs.
    'Random Forest': RandomForestClassifier(random_state=42),
    # Add more models as needed
}

mlflow.set_experiment("scikit_learn_imdb")

# Train and evaluate models
results = {}

for name, model in models.items():
    # One MLflow run per model; the context manager ends the run on exit,
    # so no explicit mlflow.end_run() is needed afterwards.
    with mlflow.start_run(run_name=name) as run:
        model.fit(X_train_vectorized, y_train)
        y_pred = model.predict(X_test_vectorized)
        accuracy = accuracy_score(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred)

        # Log parameters and metrics to MLflow
        mlflow.log_params({'model_type': type(model).__name__})
        mlflow.log_metrics({'accuracy': accuracy})

        # Save the model as an MLflow artifact
        mlflow.sklearn.log_model(model, f"{name}_model")

        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'classification_report': classification_rep,
            # Kept so later cells can build a "runs:/<run_id>/..." URI
            # without hard-coding an ID.
            'run_id': run.info.run_id,
        }

# Track the best model: highest test accuracy, first entry wins on ties.
best_model = None
best_accuracy = 0
if results:
    best_name = max(results, key=lambda key: results[key]['accuracy'])
    best_model = results[best_name]['model']
    best_accuracy = results[best_name]['accuracy']

# Print the best model and its accuracy
print(f"Best Model: {type(best_model).__name__}")
print(f"Best Model Accuracy: {best_accuracy}")

Best Model: LogisticRegression
Best Model Accuracy: 0.854


In [11]:
# Load the logged logistic regression model back from MLflow.
# The run is looked up by name (most recent finished run first) rather than
# by a hard-coded run ID, because run IDs are regenerated every time the
# training cell is executed and differ between tracking stores.
_experiment = mlflow.get_experiment_by_name("scikit_learn_imdb")
if _experiment is None:
    raise RuntimeError(
        "MLflow experiment 'scikit_learn_imdb' not found; run the training cell first."
    )

_log_reg_runs = mlflow.search_runs(
    experiment_ids=[_experiment.experiment_id],
    filter_string=(
        "tags.mlflow.runName = 'Logistic Regression' "
        "and attributes.status = 'FINISHED'"
    ),
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if _log_reg_runs.empty:
    raise RuntimeError(
        "No finished 'Logistic Regression' run found; run the training cell first."
    )

log_reg_run_id = _log_reg_runs.iloc[0]["run_id"]
log_reg_model = mlflow.sklearn.load_model(
    f"runs:/{log_reg_run_id}/Logistic Regression_model"
)

In [12]:
# Convert the logistic regression model into an ONNX format file
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Declare a single float input named 'float_input'; the first dimension is
# left as None and the second is the number of TF-IDF feature columns.
initial_type = [('float_input', FloatTensorType([None, X_train_vectorized.shape[1]]))]
onx = convert_sklearn(log_reg_model, initial_types=initial_type)

# open() does not create missing directories, so make sure the target exists.
os.makedirs("outputs", exist_ok=True)
with open("outputs/log_reg_model.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [13]:
import pickle

# Persist the fitted TF-IDF vectorizer to disk.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this file from a trusted location.
# open() does not create missing directories, so make sure the target exists.
os.makedirs('./outputs', exist_ok=True)
with open('./outputs/vectorizer.pkl', 'wb') as vectorizer_pkl:
    pickle.dump(vectorizer, vectorizer_pkl)