# Notebook for working through the DagsHub walkthrough on using MLflow



# | default_exp core

In [None]:
from datetime import datetime
from joblib import  load
import matplotlib.pyplot as plt
import mlflow
from mlflow.models.signature import infer_signature

# import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import spacy
import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS

# from spacy.lemmatizer import Lemmatizer
from tqdm import tqdm
from typing import Any

In [None]:
# DagsHub account settings. The `#@markdown` / `#@param` comments look like Colab
# form annotations, so they are kept exactly as written.
# NOTE(review): the account email is hardcoded and is not referenced by any other
# cell visible in this notebook — consider reading it from the environment (or
# dropping it) before sharing the notebook.
#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}


In [None]:
# Name of the DagsHub repository and of the branch used for this walkthrough.
DAGSHUB_REPO_NAME = "MeaLeon"
BRANCH = "venv4/add-try-mlflow"

## Try the new DagsHub library approach

In [None]:
import dagshub

# Initialise the DagsHub client for this project's repository. The owner and
# repository names come from the configuration constants defined in the earlier
# cells rather than from repeated string literals, so they cannot drift apart.
dagshub.init(repo_name=DAGSHUB_REPO_NAME, repo_owner=DAGSHUB_USER_NAME)

## Generate an Access Token for improved account security

In [None]:
# import requests
# import getpass
# import datetime

# r = requests.post('https://dagshub.com/api/v1/user/tokens', 
#                   json={"name": f"colab-token-{datetime.datetime.now()}"}, 
#                   auth=(DAGSHUB_USER_NAME, getpass.getpass('DAGsHub password:')))
# r.raise_for_status()
# DAGSHUB_TOKEN=r.json()['sha1']

Skipping the walkthrough's CLI-based steps (cloning the repo, installing and configuring DVC, and installing MLflow), since we are using a different repo and the venv already has everything set up.

## Log Experiments Locally

### Import MLflow

### Create an Experiment & Get the Experiment ID

In [None]:
def get_experiment_id(name):
    """Return the ID of the MLflow experiment called ``name``, creating it if absent.

    Parameters
    ----------
    name : str
        Experiment name to look up on the active MLflow tracking backend.

    Returns
    -------
    The ``experiment_id`` of the existing experiment, or the ID returned by
    ``mlflow.create_experiment`` when no experiment with that name exists yet.
    """
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id

    # Nothing is registered under this name yet, so create it and hand back the new ID.
    return mlflow.create_experiment(name)


In [None]:
# Look up (or create) the "initial_run" experiment and show its ID.
initial_run_id = get_experiment_id("initial_run")
print(initial_run_id)

0


In [None]:
# Location of the raw recipes JSON file.
raw_data_path = '../data/recipes-en-201706/epicurious-recipes_m2.json'

# Dated directory holding the joblib artifacts. The trailing slash matters: the
# artifact paths below are built by plain string formatting, not path joining.
joblib_basepath = '../joblib/2022.08.23/'

cv_path = f'{joblib_basepath}countvec.joblib'
tfidf_path = f'{joblib_basepath}tfidf.joblib'
full_df_path = f'{joblib_basepath}recipes_with_cv.joblib'
reduced_df_path = f'{joblib_basepath}reduced_df.joblib'

In [None]:
# NOTE(review): joblib's load() unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
# Load the persisted frame and index it by its 'id' column in a single expression
# instead of mutating the frame in place afterwards.
reduced_df = load(reduced_df_path).set_index('id', drop=True)

In [None]:
# Preview the first rows of the loaded frame to sanity-check its layout.
reduced_df.head(20)

Unnamed: 0_level_0,cuisine_name,achiote,acid,addition,adobo,adobo adobo,adobo adobo sauce,adobo sauce,adobo sauce chipotle,african,...,zest pith,zest vegetable,zinfandel,ziti,zucchini,zucchini blossom,zucchini crookneck,zucchini squash,árbol,árbol pepper
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54a408a019925f464b3733bc,Italian,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408a26529d92b2c003631,Kosher,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408a66529d92b2c003638,Kosher,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408a719925f464b3733cc,Kosher,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408c36529d92b2c0036a1,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408d319925f464b373474,Italian,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408da6529d92b2c0036f9,Kosher,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408df19925f464b3734a2,Asian,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408e119925f464b3734a4,Cajun/Creole,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408e219925f464b3734a9,Italian,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Count rows per cuisine label to see how balanced the target classes are.
reduced_df['cuisine_name'].value_counts()

American            5484
Italian             2117
French              1293
Asian               1270
Mediterranean        914
Mexican              809
Indian               355
Kosher               353
Middle Eastern       289
English              225
Caribbean            189
Eastern European     167
Latin American       153
Southwestern         148
Cajun/Creole         144
Scandinavian         131
Thai                 120
Irish                120
Moroccan             117
Chinese              115
African               92
Japanese              91
German                79
Vietnamese            57
Name: cuisine_name, dtype: int64

In [None]:
# List the column labels: the 'cuisine_name' label column plus the feature columns.
reduced_df.columns

Index(['cuisine_name', 'achiote', 'acid', 'addition', 'adobo', 'adobo adobo',
       'adobo adobo sauce', 'adobo sauce', 'adobo sauce chipotle', 'african',
       ...
       'zest pith', 'zest vegetable', 'zinfandel', 'ziti', 'zucchini',
       'zucchini blossom', 'zucchini crookneck', 'zucchini squash', 'árbol',
       'árbol pepper'],
      dtype='object', length=3352)

In [None]:
# Separate the label column from the feature columns.
y = reduced_df['cuisine_name']
X = reduced_df.drop(columns=['cuisine_name'])

# Hold out 20% of the rows for testing. Stratifying on the label keeps each
# cuisine's share the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=240,
)

In [None]:
# Train a logistic-regression cuisine classifier and record the run in MLflow.
# The experiment ID is resolved by name instead of being hardcoded as 0, so the
# run is filed under "initial_run" whatever ID the tracking backend assigned to it.
with mlflow.start_run(experiment_id=get_experiment_id("initial_run")):

    params = {
        "class_weight": 'balanced',
        "verbose": 20,
        "solver": 'saga',
        "multi_class": 'multinomial',
        "n_jobs": -1,
        # The saga solver shuffles the data, so pin the seed to keep fits repeatable
        # (same seed as the train/test split).
        "random_state": 240,
    }

    lr = LogisticRegression(**params)
    lr.fit(X_train, y_train)

    # Score on the held-out test split; this is the value logged as the run metric.
    lr_score = lr.score(X_test, y_test)

    # Log exactly the dict that configured the model so the record cannot drift
    # from what was actually trained.
    mlflow.log_params(params)

    # TODO: attach a model signature (infer_signature) if the logged model is to be served.
    mlflow.sklearn.log_model(lr, 'models/logreg_model')

    mlflow.log_metric('mean_accuracy', lr_score)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 0.57113899
Epoch 3, change: 0.77340176
Epoch 4, change: 0.75861417
Epoch 5, change: 0.43178172
Epoch 6, change: 0.38688762
Epoch 7, change: 0.64300645
Epoch 8, change: 0.57345980
Epoch 9, change: 0.33206806
Epoch 10, change: 0.55980348
Epoch 11, change: 0.58173405
Epoch 12, change: 0.45057511
Epoch 13, change: 0.37541123
Epoch 14, change: 0.40159309
Epoch 15, change: 0.52843962
Epoch 16, change: 0.44318134
Epoch 17, change: 0.66858720
Epoch 18, change: 0.56290869
Epoch 19, change: 1.48527475
Epoch 20, change: 0.75288371
Epoch 21, change: 0.75250879
Epoch 22, change: 0.59257271
Epoch 23, change: 0.55485825
Epoch 24, change: 0.62505535
Epoch 25, change: 0.47327463
Epoch 26, change: 0.59471553
Epoch 27, change: 0.56179964
Epoch 28, change: 0.64557897
Epoch 29, change: 0.90755928
Epoch 30, change: 0.64747902
Epoch 31, change: 0.56487253
Epoch 32, change: 0.47088424
Epoch 33, change: 0.71033100
Epoch 34, change: 0.47917072
Epoch 35, change: 0.491

[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  5.1min finished


In [None]:
# | hide
from nbdev.showdoc import *

# | export
def foo():
    """Placeholder function: takes no arguments, does nothing, returns ``None``."""

In [None]:
# | hide
import nbdev

# Run nbdev's export step for this project.
# NOTE(review): presumably this writes the cells marked `# | export` into the
# module named by the `# | default_exp` directive — confirm against the nbdev docs.
nbdev.nbdev_export()