In [None]:
# ref: https://www.kaggle.com/code/granjithkumar/nlp-with-women-clothing-reviews/data

import numpy as np
import pandas as pd
import os
import re
# nltk.download('stopwords')
os.environ["NLTK_DATA"] = "./corpora"
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
import pickle
import mlflow

: 

In [2]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("customer-sentiment-analysis")

MlflowException: Detected out-of-date database schema (found version cc1f77228345, but expected bd07f7e963c5). Take a backup of your database, then run 'mlflow db upgrade <database_uri>' to migrate your database to the latest schema. NOTE: schema migration may result in database downtime - please consult your database's documentation for more detail.

In [4]:
## data loading
data = pd.read_csv('data/Womens Clothing E-Commerce Reviews.csv',index_col =[0])

## preprocess text
data = data[~data['Review Text'].isnull()]  #Dropping columns which don't have any review
X = data[['Review Text']]
X.index = np.arange(len(X))

y = data['Recommended IND']

In [6]:
corpus =[]
for i in range(len(X)):
    review = re.sub('[^a-zA-z]',' ',X['Review Text'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review =[ps.stem(i) for i in review if not i in set(stopwords.words('english'))]
    review =' '.join(review)
    corpus.append(review)


In [5]:
with open('data/corpus_y.pickle', 'rb') as handle:
    corpus, y = pickle.load(handle)

In [7]:
cv  = CV(max_features = 3000,ngram_range=(1,1))
X_cv = cv.fit_transform(corpus).toarray()
# y = y.values

X_train, X_test, y_train, y_test = train_test_split(X_cv, y, test_size = 0.20, random_state = 0)


In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((18112, 3000), (4529, 3000), (18112,), (4529,))

In [12]:
np.zeros((18112,)).shape

(18112,)

In [23]:
mlflow.sklearn.autolog()

with mlflow.start_run():

    mlflow.set_tag("developer", "Isaac")
    mlflow.set_tag("algorithm", "BernoulliNB")
    mlflow.log_param("train-data", "Womens Clothing E-Commerce Reviews")
    
    alpha = .5
    mlflow.log_param("alpha", alpha)

    classifier = BernoulliNB(alpha = alpha)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    mlflow.log_metric("accuracy", acc)

    print("accuracy on test data:", acc)

    model_name = "model_bow.bin"
    with open("models/" + model_name, 'wb') as fout:
        pickle.dump((cv, classifier), fout)

    # mlflow.sklearn.log_model(classifier, artifact_path="models")
    mlflow.log_artifact(local_path="models/" + model_name, artifact_path="models_pickle")




accuracy on test data: 0.873923603444469


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer as TV
from sklearn.naive_bayes import MultinomialNB

y = data['Recommended IND']

tv  = TV(ngram_range =(1,1),max_features = 3000)
X_tv = tv.fit_transform(corpus).toarray()

X_train, X_test, y_train, y_test = train_test_split(X_tv, y, test_size = 0.20, random_state = 0)


mlflow.sklearn.autolog()

with mlflow.start_run():

    mlflow.set_tag("developer", "Isaac")
    mlflow.set_tag("algorithm", "MultinomialNB")
    mlflow.log_param("train-data", "Womens Clothing E-Commerce Reviews")

    alpha = .5
    mlflow.log_param("alpha", alpha)

    classifier = MultinomialNB(alpha = alpha)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    mlflow.log_metric("accuracy", acc)

    print("accuracy on test data:", acc)

    model_name = "model_tfidf.bin"
    with open("models/" + model_name, 'wb') as fout:
        pickle.dump((tv, classifier), fout)

    # mlflow.sklearn.log_model(classifier, artifact_path="models")
    mlflow.log_artifact(local_path="models/" + model_name, artifact_path="models_pickle")




accuracy on test data: 0.8441156988297638


In [38]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

y = data['Recommended IND']

## tokenization and dataset creation
tokenizer = Tokenizer(num_words = 3000)
tokenizer.fit_on_texts(corpus)

sequences = tokenizer.texts_to_sequences(corpus)
padded = pad_sequences(sequences, padding='post')

X_train, X_test, y_train, y_test = train_test_split(padded, y, test_size = 0.20, random_state = 0)

# mlflow.tensorflow.autolog()
# mlflow.keras.autolog()

with mlflow.start_run():
    ## model definition
    embedding_dim = 32
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(3000, embedding_dim),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(6, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    ## training
    num_epochs = 50
    batch_size = 32
    callback = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=2,
        verbose=0,
        mode="auto",
        baseline=None,
        restore_best_weights=False,
    )

    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    mlflow.set_tag("developer", "Isaac")
    mlflow.set_tag("algorithm", "Deep Learning")
    mlflow.log_param("train-data", "Womens Clothing E-Commerce Reviews")
    mlflow.log_param("embedding-dim", embedding_dim)

    print("Fit model on training data")
    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=num_epochs,
        callbacks=callback,
        # We pass some validation for
        # monitoring validation loss and metrics
        # at the end of each epoch
        validation_data=(X_test, y_test),
    )

    ## save model and tokenizer
    # model.save('models/model_dl.h5')
    mlflow.keras.log_model(model, 'models/model_dl')

    with open('models/tf_tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Evaluate the model on the test data using `evaluate`
    print("Evaluate on test data")
    results = model.evaluate(X_test, y_test, batch_size=128)
    print("test loss, test acc:", results)
    mlflow.log_metric("loss", results[0])
    mlflow.log_metric("accuracy", results[1])

    # mlflow.log_artifact(local_path="models/model_dl.h5", artifact_path="models_pickle")
    mlflow.log_artifact(local_path="models/tf_tokenizer.pickle", artifact_path="tokenizer_pickle")


    # Generate predictions (probabilities -- the output of the last layer)
    # on new data using `predict`
    print("Generate predictions for 3 samples")
    predictions = model.predict(X_test[:3])
    print("predictions shape:", predictions.shape)


    sample_string = "I Will tell my friends for sure"
    sample = tokenizer.texts_to_sequences(sample_string)
    padded_sample = pad_sequences(sample, padding='post').T
    sample_predict = model.predict(padded_sample)
    print(f"model prediction for input: {sample_string} \n {sample_predict}")


Fit model on training data
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50




INFO:tensorflow:Assets written to: C:\Users\Isaac\AppData\Local\Temp\tmp0a3f3exe\model\data\model\assets
Evaluate on test data
test loss, test acc: [0.27883851528167725, 0.8805475831031799]
Generate predictions for 3 samples
predictions shape: (3, 1)
model prediction for input: I Will tell my friends for sure 
 [[0.9991835]]


In [41]:
mlflow.keras.load_model('./mlruns/2/b7c1ae8139b94ad9aa5b566001fb4255/artifacts/models/model_dl')

<tensorflow.python.keras.engine.sequential.Sequential at 0x24975a0c0d0>

In [9]:
m = mlflow.keras.load_model("models:/customer-sentiment-analysis/3", dst_path=None)
m

<tensorflow.python.keras.engine.sequential.Sequential at 0x20dee6e98b0>

In [10]:
from mlflow.tracking import MlflowClient
client = MlflowClient(tracking_uri="sqlite:///mlflow.db")
client.download_artifacts(run_id="d3ebd0c0b590443e824cde73fe041a6e", path='models_pickle', dst_path='.')

'c:\\Users\\Isaac\\codes\\personal\\mlops-zoomcamp-final-project\\training_job\\models_pickle'

In [19]:
from mlflow.tracking import MlflowClient
from pprint import pprint

client = MlflowClient(tracking_uri="sqlite:///mlflow.db")
# for rm in client.list_registered_models():
#     pprint(dict(rm), indent=4)

for mv in client.search_model_versions("name='customer-sentiment-analysis'"):
    if dict(mv)['current_stage'] == 'Production':
        pprint(dict(mv), indent=4)
        pprint(dict(mv)['run_id'])
    

{   'creation_timestamp': 1660141156340,
    'current_stage': 'Production',
    'description': 'moved to production on the 8th of August 2020 by Isaac.',
    'last_updated_timestamp': 1660143559490,
    'name': 'customer-sentiment-analysis',
    'run_id': 'd3ebd0c0b590443e824cde73fe041a6e',
    'run_link': '',
    'source': './mlruns/2/d3ebd0c0b590443e824cde73fe041a6e/artifacts/models/model_dl.h5',
    'status': 'READY',
    'status_message': None,
    'tags': {},
    'user_id': None,
    'version': 3}
'd3ebd0c0b590443e824cde73fe041a6e'
