In [1]:
import pandas as pd

# SMS dataset: tab-separated file with no header row, so the two column
# names are supplied explicitly.
sms_url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(
    sms_url,
    sep="\t",
    header=None,
    names=["label", "message"],
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["message"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Token-count features: the vocabulary is learned from the training messages
# only and then reused to encode the held-out messages.
# (TfidfVectorizer, imported above, is the alternative that was tried here.)
vec = CountVectorizer()
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Multinomial naive Bayes on the count features.
# (LogisticRegression(max_iter=1000), imported above, is the other candidate.)
model = MultinomialNB().fit(X_train_vec, y_train)

# Predict the held-out labels and report plain accuracy.
preds = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, preds))

Accuracy: 0.9919282511210762


In [17]:
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.metrics import accuracy_score
from mlflow.models.signature import infer_signature

# Start tracking
# Train the spam classifier inside an MLflow run so that its parameters,
# accuracy metric and fitted model are recorded together.
with mlflow.start_run():

    # Estimators for this run.
    # (Alternatives tried previously: TfidfVectorizer(), LogisticRegression(max_iter=1000).)
    vectorizer = CountVectorizer()
    model = MultinomialNB()

    # Features: fit the vocabulary on the training messages only, then encode
    # the held-out messages with that same vocabulary.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train and predict on the held-out set.
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)

    # Evaluate.
    acc = accuracy_score(y_test, preds)

    # Log parameters and metrics. The estimator names are derived from the
    # objects themselves so the logged values cannot drift from the code when
    # a different vectorizer or model is swapped in.
    mlflow.log_param("vectorizer", type(vectorizer).__name__)
    mlflow.log_param("test_size", 0.2)  # keep in sync with the train_test_split call
    mlflow.log_param("model", type(model).__name__)
    mlflow.log_metric("accuracy", acc)

    # One dense sample row serves as the input example. The signature is
    # inferred from that same row and its prediction, which avoids
    # materialising the whole sparse test matrix as a dense array.
    input_example = X_test_vec[0].toarray()
    signature = infer_signature(input_example, preds[:1])

    # Log the fitted classifier. Only `model` is stored here: the fitted
    # vectorizer is not part of the artifact, so consumers must supply count
    # vectors laid out like `input_example`.
    mlflow.sklearn.log_model(
        sk_model=model,
        # NOTE(review): the original author flagged this keyword as
        # "still works for now" - confirm against the installed MLflow version.
        artifact_path="model",
        input_example=input_example,
        signature=signature,
    )

    print("Accuracy:", acc)




Accuracy: 0.9919282511210762
