Making changes to experiment notebook based on changes to tutorial document (#249)

jotaylo · web-flow · commit 319cae664381 · 2020-04-02T09:28:37.000-07:00
* making changes to notebook based on changes to tutorial document

* test update to assert_almost_equal

* reference repo details in byoc guide
diff --git a/bootstrap/README.md b/bootstrap/README.md
@@ -4,7 +4,7 @@ To use this existing project structure and scripts for your new ML project, you
 
 Bootstrapping will prepare a directory structure for your project which includes:
 
-* renaming files and folders from the base project name `diabetes` to your project name
+* renaming files and folders from the base project name `diabetes_regression` to your project name
 * fixing imports and absolute paths based on your project name
 * deleting and cleaning up some directories
 
diff --git a/diabetes_regression/training/test_train.py b/diabetes_regression/training/test_train.py
@@ -10,7 +10,7 @@ def test_train_model():
     reg_model = train_model(data, {"alpha": 1.2})
 
     preds = reg_model.predict([[1], [2]])
-    np.testing.assert_equal(preds, [9.93939393939394, 9.03030303030303])
+    np.testing.assert_almost_equal(preds, [9.93939393939394, 9.03030303030303])
 
 
 def test_get_model_metrics():
diff --git a/docs/custom_model.md b/docs/custom_model.md
@@ -2,8 +2,8 @@
 
 This document provides steps to follow when using this repository as a template to train models and deploy the models with real-time inference in Azure ML with your own scripts and data.
 
-1. Follow the MLOpsPython [Getting Started](https://github.com/microsoft/MLOpsPython/blob/master/docs/getting_started.md) guide
-1. Follow the MLOpsPython [bootstrap instructions](https://github.com/microsoft/MLOpsPython/blob/master/bootstrap/README.md) to create your project starting point
+1. Follow the MLOpsPython [Getting Started](getting_started.md) guide
+1. Follow the MLOpsPython [bootstrap instructions](../bootstrap/README.md) to create your project starting point
 1. Configure training data
 1. [If necessary] Convert your ML experimental code into production ready code
 1. Replace the training code
@@ -13,11 +13,13 @@ This document provides steps to follow when using this repository as a template
 
 ## Follow the Getting Started guide
 
-Follow the [Getting Started](https://github.com/microsoft/MLOpsPython/blob/master/docs/getting_started.md) guide to set up the infrastructure and pipelines to execute MLOpsPython.
+Follow the [Getting Started](getting_started.md) guide to set up the infrastructure and pipelines to execute MLOpsPython.
+
+Take a look at the [Repo Details](code_description.md) document for a description of the structure of this repository.
 
 ## Follow the Bootstrap instructions
 
-The [Bootstrap from MLOpsPython repository](https://github.com/microsoft/MLOpsPython/blob/master/bootstrap/README.md) guide will help you to quickly prepare the repository for your project.
+The [Bootstrap from MLOpsPython repository](../bootstrap/README.md) guide will help you to quickly prepare the repository for your project.
 
 **Note:** Since the bootstrap script will rename the `diabetes_regression` folder to the project name of your choice, we'll refer to your project as `[project name]` when paths are involved.
 
diff --git a/experimentation/Diabetes Ridge Regression Training.ipynb b/experimentation/Diabetes Ridge Regression Training.ipynb
@@ -16,15 +16,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.datasets import load_diabetes\n",
     "from sklearn.linear_model import Ridge\n",
     "from sklearn.metrics import mean_squared_error\n",
     "from sklearn.model_selection import train_test_split\n",
-    "import joblib"
+    "import joblib\n",
+    "import pandas as pd"
    ]
   },
   {
@@ -36,16 +37,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
-    "X, y = load_diabetes(return_X_y=True)"
+    "sample_data = load_diabetes()\n",
+    "\n",
+    "df = pd.DataFrame(\n",
+    "    data=sample_data.data,\n",
+    "    columns=sample_data.feature_names)\n",
+    "df['Y'] = sample_data.target"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -57,29 +63,12 @@
     }
    ],
    "source": [
-    "print(X.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(442,)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(y.shape)"
+    "print(df.shape)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -103,16 +92,17 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>0</th>\n",
-       "      <th>1</th>\n",
-       "      <th>2</th>\n",
-       "      <th>3</th>\n",
-       "      <th>4</th>\n",
-       "      <th>5</th>\n",
-       "      <th>6</th>\n",
-       "      <th>7</th>\n",
-       "      <th>8</th>\n",
-       "      <th>9</th>\n",
+       "      <th>age</th>\n",
+       "      <th>sex</th>\n",
+       "      <th>bmi</th>\n",
+       "      <th>bp</th>\n",
+       "      <th>s1</th>\n",
+       "      <th>s2</th>\n",
+       "      <th>s3</th>\n",
+       "      <th>s4</th>\n",
+       "      <th>s5</th>\n",
+       "      <th>s6</th>\n",
+       "      <th>Y</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -128,19 +118,21 @@
        "      <td>4.420000e+02</td>\n",
        "      <td>4.420000e+02</td>\n",
        "      <td>4.420000e+02</td>\n",
+       "      <td>442.000000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>mean</td>\n",
-       "      <td>-3.639623e-16</td>\n",
-       "      <td>1.309912e-16</td>\n",
-       "      <td>-8.013951e-16</td>\n",
-       "      <td>1.289818e-16</td>\n",
-       "      <td>-9.042540e-17</td>\n",
-       "      <td>1.301121e-16</td>\n",
-       "      <td>-4.563971e-16</td>\n",
-       "      <td>3.863174e-16</td>\n",
-       "      <td>-3.848103e-16</td>\n",
-       "      <td>-3.398488e-16</td>\n",
+       "      <td>-3.634285e-16</td>\n",
+       "      <td>1.308343e-16</td>\n",
+       "      <td>-8.045349e-16</td>\n",
+       "      <td>1.281655e-16</td>\n",
+       "      <td>-8.835316e-17</td>\n",
+       "      <td>1.327024e-16</td>\n",
+       "      <td>-4.574646e-16</td>\n",
+       "      <td>3.777301e-16</td>\n",
+       "      <td>-3.830854e-16</td>\n",
+       "      <td>-3.412882e-16</td>\n",
+       "      <td>152.133484</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>std</td>\n",
@@ -154,6 +146,7 @@
        "      <td>4.761905e-02</td>\n",
        "      <td>4.761905e-02</td>\n",
        "      <td>4.761905e-02</td>\n",
+       "      <td>77.093005</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>min</td>\n",
@@ -167,6 +160,7 @@
        "      <td>-7.639450e-02</td>\n",
        "      <td>-1.260974e-01</td>\n",
        "      <td>-1.377672e-01</td>\n",
+       "      <td>25.000000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>25%</td>\n",
@@ -180,6 +174,7 @@
        "      <td>-3.949338e-02</td>\n",
        "      <td>-3.324879e-02</td>\n",
        "      <td>-3.317903e-02</td>\n",
+       "      <td>87.000000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>50%</td>\n",
@@ -193,6 +188,7 @@
        "      <td>-2.592262e-03</td>\n",
        "      <td>-1.947634e-03</td>\n",
        "      <td>-1.077698e-03</td>\n",
+       "      <td>140.500000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>75%</td>\n",
@@ -206,6 +202,7 @@
        "      <td>3.430886e-02</td>\n",
        "      <td>3.243323e-02</td>\n",
        "      <td>2.791705e-02</td>\n",
+       "      <td>211.500000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>max</td>\n",
@@ -219,42 +216,52 @@
        "      <td>1.852344e-01</td>\n",
        "      <td>1.335990e-01</td>\n",
        "      <td>1.356118e-01</td>\n",
+       "      <td>346.000000</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                  0             1             2             3             4  \\\n",
+       "                age           sex           bmi            bp            s1  \\\n",
        "count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02   \n",
-       "mean  -3.639623e-16  1.309912e-16 -8.013951e-16  1.289818e-16 -9.042540e-17   \n",
+       "mean  -3.634285e-16  1.308343e-16 -8.045349e-16  1.281655e-16 -8.835316e-17   \n",
        "std    4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02   \n",
        "min   -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123996e-01 -1.267807e-01   \n",
        "25%   -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665645e-02 -3.424784e-02   \n",
        "50%    5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670611e-03 -4.320866e-03   \n",
        "75%    3.807591e-02  5.068012e-02  3.124802e-02  3.564384e-02  2.835801e-02   \n",
        "max    1.107267e-01  5.068012e-02  1.705552e-01  1.320442e-01  1.539137e-01   \n",
        "\n",
-       "                  5             6             7             8             9  \n",
-       "count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  \n",
-       "mean   1.301121e-16 -4.563971e-16  3.863174e-16 -3.848103e-16 -3.398488e-16  \n",
-       "std    4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  \n",
-       "min   -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260974e-01 -1.377672e-01  \n",
-       "25%   -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324879e-02 -3.317903e-02  \n",
-       "50%   -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947634e-03 -1.077698e-03  \n",
-       "75%    2.984439e-02  2.931150e-02  3.430886e-02  3.243323e-02  2.791705e-02  \n",
-       "max    1.987880e-01  1.811791e-01  1.852344e-01  1.335990e-01  1.356118e-01  "
+       "                 s2            s3            s4            s5            s6  \\\n",
+       "count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02   \n",
+       "mean   1.327024e-16 -4.574646e-16  3.777301e-16 -3.830854e-16 -3.412882e-16   \n",
+       "std    4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02   \n",
+       "min   -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260974e-01 -1.377672e-01   \n",
+       "25%   -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324879e-02 -3.317903e-02   \n",
+       "50%   -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947634e-03 -1.077698e-03   \n",
+       "75%    2.984439e-02  2.931150e-02  3.430886e-02  3.243323e-02  2.791705e-02   \n",
+       "max    1.987880e-01  1.811791e-01  1.852344e-01  1.335990e-01  1.356118e-01   \n",
+       "\n",
+       "                Y  \n",
+       "count  442.000000  \n",
+       "mean   152.133484  \n",
+       "std     77.093005  \n",
+       "min     25.000000  \n",
+       "25%     87.000000  \n",
+       "50%    140.500000  \n",
+       "75%    211.500000  \n",
+       "max    346.000000  "
       ]
      },
-     "execution_count": 8,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "import pandas as pd\n",
-    "features = pd.DataFrame(X)\n",
-    "features.describe()"
+    "# All data in a single dataframe\n",
+    "df.describe()"
    ]
   },
   {
@@ -266,11 +273,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
-    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n",
+    "X = df.drop('Y', axis=1).values\n",
+    "y = df['Y'].values\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X, y, test_size=0.2, random_state=0)\n",
     "data = {\"train\": {\"X\": X_train, \"y\": y_train},\n",
     "        \"test\": {\"X\": X_test, \"y\": y_test}}"
    ]
@@ -284,7 +295,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -294,16 +305,19 @@
        "      normalize=False, random_state=None, solver='auto', tol=0.001)"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "alpha = 0.5\n",
+    "# experiment parameters\n",
+    "args = {\n",
+    "    \"alpha\": 0.5\n",
+    "}\n",
     "\n",
-    "reg = Ridge(alpha=alpha)\n",
-    "reg.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])"
+    "reg_model = Ridge(**args)\n",
+    "reg_model.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])"
    ]
   },
   {
@@ -315,20 +329,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "mse:  3298.9096058070622\n"
+      "{'mse': 3298.9096058070622}\n"
      ]
     }
    ],
    "source": [
-    "preds = reg.predict(data[\"test\"][\"X\"])\n",
-    "print(\"mse: \", mean_squared_error(preds, y_test))"
+    "preds = reg_model.predict(data[\"test\"][\"X\"])\n",
+    "mse = mean_squared_error(preds, y_test)\n",
+    "metrics = {\"mse\": mse}\n",
+    "print(metrics)"
    ]
   },
   {
@@ -363,9 +379,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python (storedna)",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "storedna"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -377,7 +393,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.7.4"
   }
  },
  "nbformat": 4,