remove brittle dataset downloading from demos
mhamilton723 committed Jul 26, 2019
1 parent e572a9a commit 4ebbb41
Showing 14 changed files with 37 additions and 243 deletions.
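
Every notebook change below follows the same shape: drop the ad-hoc HTTP download plus pandas conversion and read a pre-hosted Parquet copy from the public blob container instead. A condensed sketch of the before/after using the Adult Census file from the first notebook diff (the paths come from the diffs; `spark` is assumed to be the notebook's existing SparkSession):

    # Old pattern (removed from the notebooks): download a CSV over HTTP, then convert via pandas.
    import os, urllib.request
    import pandas as pd

    dataFilePath = "AdultCensusIncome.csv"
    if not os.path.isfile(dataFilePath):
        urllib.request.urlretrieve("https://mmlspark.azureedge.net/datasets/" + dataFilePath, dataFilePath)
    data = spark.createDataFrame(pd.read_csv(dataFilePath))

    # New pattern (added): read a pre-hosted Parquet copy straight from public blob storage.
    data = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet")
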
11 changes: 7 additions & 4 deletions build.sbt
@@ -234,15 +234,18 @@ genBuildInfo := {

val buildInfo =
s"""
|MMLSpark Build Release Info
|MMLSpark Build and Release Information
|---------------
|
|### Maven Coordinates
| `${organization.value}:${name.value}_2.11:${version.value}`
|
|### Documentation Uploaded:
|[Scala](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
|[Python](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
|### Maven Resolver
| `https://mmlspark.azureedge.net/maven`
|
|### Documentation Pages:
|[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
|[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
|
""".stripMargin

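
The coordinates and resolver above are what downstream users point their Spark sessions at. A hypothetical PySpark sketch of doing so — the artifact string `com.microsoft.ml.spark:mmlspark_2.11:0.17` is an assumed example, not a value taken from this commit:

    # Hypothetical: resolve MMLSpark at session start using the Maven resolver above.
    from pyspark.sql import SparkSession

    spark = (SparkSession.builder
             .appName("mmlspark-demo")
             .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:0.17")   # assumed coordinate
             .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
             .getOrCreate())
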
10 changes: 3 additions & 7 deletions notebooks/samples/Classification - Adult Census.ipynb
@@ -34,12 +34,8 @@
"metadata": {},
"outputs": [],
"source": [
"dataFilePath = \"AdultCensusIncome.csv\"\n",
"import os, urllib\n",
"if not os.path.isfile(dataFilePath):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
"data = spark.createDataFrame(pd.read_csv(dataFilePath, dtype={\" hours-per-week\": np.float64}))\n",
"data = data.select([\" education\", \" marital-status\", \" hours-per-week\", \" income\"])\n",
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
"data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
"train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
"train.limit(10).toPandas()"
]
@@ -64,7 +60,7 @@
"source": [
"from mmlspark.train import TrainClassifier\n",
"from pyspark.ml.classification import LogisticRegression\n",
"model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)\n",
"model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)\n",
"model.write().overwrite().save(\"adultCensusIncomeModel.mml\")"
]
},
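
Note the label column changes from " income" (leading space, an artifact of the original CSV headers) to "income" in the Parquet copy. A quick sketch to confirm the column names after the switch — the expected names are inferred from the select in the diff above:

    # Sketch: verify the Parquet column names carry no leading spaces before training.
    data = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet")
    print(data.columns)  # expected to include "education", "marital-status", "hours-per-week", "income"
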
14 changes: 2 additions & 12 deletions notebooks/samples/Classification - Before and After MMLSpark.ipynb
@@ -41,17 +41,7 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pyspark.sql.types import IntegerType, StringType, StructType, StructField\n",
"import os, urllib\n",
"\n",
"dataFilePath = \"BookReviewsFromAmazon10K.tsv\"\n",
"textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n",
" StructField(\"text\", StringType(), False)])\n",
"\n",
"if not os.path.isfile(dataFilePath):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
"rawData = spark.createDataFrame(pd.read_csv(dataFilePath, sep=\"\\t\", header=None), textSchema)\n",
"rawData = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
"rawData.show(5)"
]
},
@@ -75,7 +65,7 @@
"outputs": [],
"source": [
"from pyspark.sql.functions import udf\n",
"from pyspark.sql.types import LongType, FloatType, DoubleType\n",
"from pyspark.sql.types import *\n",
"def wordCount(s):\n",
" return len(s.split())\n",
"def wordLength(s):\n",
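
The hunk above cuts off at the helper definitions. A self-contained sketch of how such helpers are typically wrapped as Spark UDFs — the return types and the body of wordLength are my assumptions, since the notebook's actual code is truncated here:

    # Sketch: wrap the helpers as UDFs (return types assumed; wordLength body is hypothetical).
    from pyspark.sql.functions import udf
    from pyspark.sql.types import LongType, DoubleType

    def wordCount(s):
        return len(s.split())

    def wordLength(s):
        words = s.split()
        return float(sum(len(w) for w in words)) / max(len(words), 1)

    wordCountUDF = udf(wordCount, LongType())
    wordLengthUDF = udf(wordLength, DoubleType())

    rawData.withColumn("wordCount", wordCountUDF("text")).show(5)
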
@@ -37,8 +37,7 @@
"\n",
"# Please note that this is a copy of the CIFAR10 dataset originally found here:\n",
"# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
"dataFile = \"cifar-10-python.tar.gz\"\n",
"dataURL = cdnURL + \"/CIFAR10/\" + dataFile"
"imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
]
},
{
@@ -54,24 +53,11 @@
"modelDir = \"dbfs:///models/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"mml-deploy": "local",
"collapsed": false
},
"outputs": [],
"source": [
"modelName = \"ConvNet\"\n",
"modelDir = \"file:\" + abspath(\"models\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the model and extract the data."
"Get the model"
]
},
{
@@ -80,49 +66,8 @@
"metadata": {},
"outputs": [],
"source": [
"import os, tarfile, pickle\n",
"import urllib.request\n",
"\n",
"d = ModelDownloader(spark, modelDir)\n",
"model = d.downloadByName(modelName)\n",
"if not os.path.isfile(dataFile):\n",
" urllib.request.urlretrieve(dataURL, dataFile)\n",
"with tarfile.open(dataFile, \"r:gz\") as f:\n",
" test_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/test_batch\"),\n",
" encoding=\"latin1\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Preprocess the images."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import col\n",
"from pyspark.sql.types import *\n",
"\n",
"def reshape_image(record):\n",
" image, label, filename = record\n",
" data = [float(x) for x in image.reshape(3,32,32).flatten()]\n",
" return data, label, filename\n",
"\n",
"convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n",
"\n",
"image_rdd = zip(test_dict[\"data\"], test_dict[\"labels\"], test_dict[\"filenames\"])\n",
"image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n",
"\n",
"imagesWithLabels = image_rdd.toDF([\"images\", \"labels\", \"filename\"])\n",
"imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))\n",
"imagesWithLabels.printSchema()\n",
"\n",
"imagesWithLabels.cache()"
"model = d.downloadByName(modelName)\n"
]
},
{
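
With the tar extraction and RDD reshaping gone, the test images come straight from a pre-converted Parquet file. A minimal sketch mirroring what the removed cell printed and cached — the column names are assumed to match the old ones ("images", "labels", "filename"):

    # Sketch: load the pre-converted CIFAR-10 test set and inspect it.
    imagesWithLabels = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet")
    imagesWithLabels.printSchema()   # columns assumed: "images", "labels", "filename"
    imagesWithLabels.cache()
    print(imagesWithLabels.count())
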
47 changes: 2 additions & 45 deletions notebooks/samples/DeepLearning - Transfer Learning.ipynb
@@ -14,7 +14,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"First, we load first batch of CIFAR-10 training data into NumPy array."
"Load DNN Model and pick one of the inner layers as feature output"
]
},
{
@@ -29,39 +29,7 @@
"from os.path import abspath\n",
"from pyspark.sql.functions import col, udf\n",
"from pyspark.sql.types import *\n",
"\n",
"cdnURL = \"https://mmlspark.azureedge.net/datasets\"\n",
"\n",
"# Please note that this is a copy of the CIFAR10 dataset originally found here:\n",
"# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
"dataFile = \"cifar-10-python.tar.gz\"\n",
"dataURL = cdnURL + \"/CIFAR10/\" + dataFile\n",
"\n",
"if not os.path.isfile(dataFile):\n",
" urllib.request.urlretrieve(dataURL, dataFile)\n",
"with tarfile.open(dataFile, \"r:gz\") as f:\n",
" train_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/data_batch_1\"),\n",
" encoding=\"latin1\")\n",
"\n",
"train_data = np.array(train_dict[\"data\"])\n",
"train_labels = np.array(train_dict[\"labels\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load DNN Model and pick one of the inner layers as feature output"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"modelName = \"ConvNet\"\n",
"modelDir = \"wasb:///models/\"\n",
"modelDir = \"file:\" + abspath(\"models\")\n",
"d = ModelDownloader(spark, modelDir)\n",
"model = d.downloadByName(modelName)\n",
@@ -83,18 +51,7 @@
"metadata": {},
"outputs": [],
"source": [
"def reshape_image(record):\n",
" image, label = record\n",
" data = [float(x) for x in image.reshape(3,32,32).flatten()]\n",
" return data, int(label)\n",
"\n",
"convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n",
"\n",
"image_rdd = zip(train_data,train_labels)\n",
"image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n",
"\n",
"imagesWithLabels = image_rdd.toDF([\"images\", \"labels\"])\n",
"imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))"
"imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
]
},
{
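
This notebook now caches the downloaded model under a local file: path rather than wasb:///, while the CIFAR10 notebook above uses dbfs:///models/. A small sketch of choosing the cache location per environment — the on_databricks flag is illustrative, not from the notebooks:

    # Sketch: pick a model cache location; `on_databricks` is a hypothetical toggle.
    from os.path import abspath

    on_databricks = False
    modelDir = "dbfs:///models/" if on_databricks else "file:" + abspath("models")
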
@@ -17,8 +17,7 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField"
"import pandas as pd\n"
]
},
{
@@ -34,21 +33,7 @@
"metadata": {},
"outputs": [],
"source": [
"dataFilePath = \"BreastCancer.csv\"\n",
"textSchema = StructType([StructField(\"Label\", IntegerType(), False),\n",
" StructField(\"Clump Thickness\", IntegerType(), False),\n",
" StructField(\"Uniformity of Cell Size\", IntegerType(), False),\n",
" StructField(\"Uniformity of Cell Shape\", IntegerType(), False),\n",
" StructField(\"Marginal Adhesion\", IntegerType(), False),\n",
" StructField(\"Single Epithelial Cell Size\", IntegerType(), False),\n",
" StructField(\"Bare Nuclei\", FloatType(), False),\n",
" StructField(\"Bland Chromatin\", IntegerType(), False),\n",
" StructField(\"Normal Nucleoli\", IntegerType(), False),\n",
" StructField(\"Mitoses\", IntegerType(), False),])\n",
"import os, urllib\n",
"if not os.path.isfile(dataFilePath):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
"data = spark.createDataFrame(pd.read_csv(dataFilePath, sep=\",\", header=0, na_values=\"?\"), textSchema)\n",
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\")\n",
"tune, test = data.randomSplit([0.80, 0.20])\n",
"tune.limit(10).toPandas()"
]
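
The removed CSV reader treated "?" as a missing value; the Parquet copy is assumed to already encode those as nulls. A quick sanity check after loading:

    # Sketch: count nulls per column to confirm missing values carried over (assumption about the Parquet copy).
    from pyspark.sql.functions import col, sum as sql_sum

    data = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet")
    data.select([sql_sum(col(c).isNull().cast("int")).alias(c) for c in data.columns]).show()
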
@@ -51,11 +51,7 @@
"metadata": {},
"outputs": [],
"source": [
"dataFile = \"On_Time_Performance_2012_9.csv\"\n",
"import os, urllib\n",
"if not os.path.isfile(dataFile):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
"flightDelay = spark.createDataFrame(pd.read_csv(dataFile))\n",
"flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
"# print some basic info\n",
"print(\"records read: \" + str(flightDelay.count()))\n",
"print(\"Schema: \")\n",
51 changes: 1 addition & 50 deletions notebooks/samples/Regression - Auto Imports.ipynb
@@ -29,62 +29,13 @@
"using `pandas.read_csv()`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Declare the schema for the data that will be converted from the pandas\n",
"DataFrame to a Spark DataFrame. Allow all fields to be nullable, so that\n",
"missing values can be handled appropriately, such as replacing them with\n",
"the mean or median value for that column."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pyspark.sql.types import LongType, StringType, DoubleType, StructType, StructField\n",
"\n",
"colSchema = (\n",
" (\"symboling\", LongType), (\"normalized-losses\", DoubleType), (\"make\", StringType),\n",
" (\"fuel-type\", StringType), (\"aspiration\", StringType), (\"body-style\", StringType),\n",
" (\"drive-wheels\", StringType), (\"engine-location\", StringType), (\"wheel-base\", DoubleType),\n",
" (\"length\", DoubleType), (\"width\", DoubleType), (\"height\", DoubleType),\n",
" (\"curb-weight\", LongType), (\"engine-type\", StringType), (\"num-of-cylinders\", StringType),\n",
" (\"engine-size\", LongType), (\"fuel-system\", StringType), (\"bore\", DoubleType),\n",
" (\"stroke\", DoubleType), (\"compression-ratio\", DoubleType), (\"horsepower\", DoubleType),\n",
" (\"peak-rpm\", DoubleType), (\"city-mpg\", LongType), (\"highway-mpg\", LongType),\n",
" (\"price\", DoubleType))\n",
"\n",
"tableSchema = StructType([StructField(column[0], column[1](),True) for column in colSchema])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read the data from the AutomobilePriceRaw.csv file into a pandas dataframe.\n",
"Specify possible reprsentations of missing values, and drop the `num-of-doors`\n",
"column as the data is read in."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataFile = \"AutomobilePriceRaw.csv\"\n",
"import os, urllib\n",
"if not os.path.isfile(dataFile):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
"data = spark.createDataFrame(pd.read_csv(dataFile,\n",
" na_values=[\"\", \" \", \"?\"],\n",
" usecols=tableSchema.names),\n",
" tableSchema)"
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet\")\n"
]
},
{
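
The deleted markdown noted that nullable fields let missing values be replaced with a column's mean or median; that step still applies after the switch to Parquet. A hedged sketch using Spark ML's Imputer (a stand-in here, not necessarily what the notebook uses) on numeric columns named in the removed schema, assuming they are stored as doubles:

    # Sketch: mean-impute numeric columns (names taken from the removed schema; numeric types assumed).
    from pyspark.ml.feature import Imputer

    data = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet")
    numericCols = ["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]
    imputer = Imputer(strategy="mean",
                      inputCols=numericCols,
                      outputCols=[c + "-imputed" for c in numericCols])
    data = imputer.fit(data).transform(data)
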
19 changes: 4 additions & 15 deletions notebooks/samples/Regression - Flight Delays.ipynb
@@ -37,21 +37,10 @@
"metadata": {},
"outputs": [],
"source": [
"# load raw data from small-sized 30 MB CSV file (trimmed to contain just what we use)\n",
"dataFilePath = \"On_Time_Performance_2012_9.csv\"\n",
"import os, urllib\n",
"if not os.path.isfile(dataFilePath):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath,\n",
" dataFilePath)\n",
"flightDelay = spark.createDataFrame(\n",
" pd.read_csv(dataFilePath,\n",
" dtype={\"Month\": np.float64, \"Quarter\": np.float64,\n",
" \"DayofMonth\": np.float64, \"DayOfWeek\": np.float64,\n",
" \"OriginAirportID\": np.float64, \"DestAirportID\": np.float64,\n",
" \"CRSDepTime\": np.float64, \"CRSArrTime\": np.float64}))\n",
"# Print information on the dataset we loaded\n",
"print(\"Records read: \" + str(flightDelay.count()))\n",
"print(\"Schema:\")\n",
"flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
"# print some basic info\n",
"print(\"records read: \" + str(flightDelay.count()))\n",
"print(\"Schema: \")\n",
"flightDelay.printSchema()\n",
"flightDelay.limit(10).toPandas()"
]
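
The removed pandas loader forced Month, Quarter, DayofMonth, DayOfWeek, OriginAirportID, DestAirportID, CRSDepTime and CRSArrTime to float64. If the Parquet copy stores them as integers (an assumption, not verified here), the equivalent cast in Spark would be:

    # Sketch: cast the columns the old loader coerced to float64 (only needed if the Parquet types differ).
    from pyspark.sql.functions import col

    numericCols = ["Month", "Quarter", "DayofMonth", "DayOfWeek",
                   "OriginAirportID", "DestAirportID", "CRSDepTime", "CRSArrTime"]
    for c in numericCols:
        flightDelay = flightDelay.withColumn(c, col(c).cast("double"))
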
