remove brittle dataset downloading from demos
mhamilton723 committed Jul 26, 2019
1 parent e572a9a commit 4ebbb41
Showing 14 changed files with 37 additions and 243 deletions.
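
Every notebook change below follows the same shape: drop the ad-hoc HTTP download plus pandas conversion and read a pre-hosted Parquet copy from the public blob container instead. A condensed sketch of the before/after using the Adult Census file from the first notebook diff (the paths come from the diffs; `spark` is assumed to be the notebook's existing SparkSession):

    # Old pattern (removed from the notebooks): download a CSV over HTTP, then convert via pandas.
    import os, urllib.request
    import pandas as pd

    dataFilePath = "AdultCensusIncome.csv"
    if not os.path.isfile(dataFilePath):
        urllib.request.urlretrieve("https://mmlspark.azureedge.net/datasets/" + dataFilePath, dataFilePath)
    data = spark.createDataFrame(pd.read_csv(dataFilePath))

    # New pattern (added): read a pre-hosted Parquet copy straight from public blob storage.
    data = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet")
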
11 changes: 7 additions & 4 deletions build.sbt
@@ -234,15 +234,18 @@ genBuildInfo := {

val buildInfo =
s"""
|MMLSpark Build Release Info
|MMLSpark Build and Release Information
|---------------
|
|### Maven Coordinates
| `${organization.value}:${name.value}_2.11:${version.value}`
|
|### Documentation Uploaded:
|[Scala](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
|[Python](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
|### Maven Resolver
| `https://mmlspark.azureedge.net/maven`
|
|### Documentation Pages:
|[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
|[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
|
""".stripMargin

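
The coordinates and resolver above are what downstream users point their Spark sessions at. A hypothetical PySpark sketch of doing so — the artifact string `com.microsoft.ml.spark:mmlspark_2.11:0.17` is an assumed example, not a value taken from this commit:

    # Hypothetical: resolve MMLSpark at session start using the Maven resolver above.
    from pyspark.sql import SparkSession

    spark = (SparkSession.builder
             .appName("mmlspark-demo")
             .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:0.17")   # assumed coordinate
             .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
             .getOrCreate())
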
10 changes: 3 additions & 7 deletions notebooks/samples/Classification - Adult Census.ipynb
@@ -34,12 +34,8 @@
"metadata": {},
"outputs": [],
"source": [
"dataFilePath = \"AdultCensusIncome.csv\"\n",
"import os, urllib\n",
"if not os.path.isfile(dataFilePath):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
"data = spark.createDataFrame(pd.read_csv(dataFilePath, dtype={\" hours-per-week\": np.float64}))\n",
"data = data.select([\" education\", \" marital-status\", \" hours-per-week\", \" income\"])\n",
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
"data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
"train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
"train.limit(10).toPandas()"
]
@@ -64,7 +60,7 @@
"source": [
"from mmlspark.train import TrainClassifier\n",
"from pyspark.ml.classification import LogisticRegression\n",
"model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)\n",
"model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)\n",
"model.write().overwrite().save(\"adultCensusIncomeModel.mml\")"
]
},
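
Note the label column changes from " income" (leading space, an artifact of the original CSV headers) to "income" in the Parquet copy. A quick sketch to confirm the column names after the switch — the expected names are inferred from the select in the diff above:

    # Sketch: verify the Parquet column names carry no leading spaces before training.
    data = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet")
    print(data.columns)  # expected to include "education", "marital-status", "hours-per-week", "income"
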
14 changes: 2 additions & 12 deletions notebooks/samples/Classification - Before and After MMLSpark.ipynb
@@ -41,17 +41,7 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pyspark.sql.types import IntegerType, StringType, StructType, StructField\n",
"import os, urllib\n",
"\n",
"dataFilePath = \"BookReviewsFromAmazon10K.tsv\"\n",
"textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n",
" StructField(\"text\", StringType(), False)])\n",
"\n",
"if not os.path.isfile(dataFilePath):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
"rawData = spark.createDataFrame(pd.read_csv(dataFilePath, sep=\"\\t\", header=None), textSchema)\n",
"rawData = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
"rawData.show(5)"
]
},
@@ -75,7 +65,7 @@
"outputs": [],
"source": [
"from pyspark.sql.functions import udf\n",
"from pyspark.sql.types import LongType, FloatType, DoubleType\n",
"from pyspark.sql.types import *\n",
"def wordCount(s):\n",
" return len(s.split())\n",
"def wordLength(s):\n",
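
The hunk above cuts off at the helper definitions. A self-contained sketch of how such helpers are typically wrapped as Spark UDFs — the return types and the body of wordLength are my assumptions, since the notebook's actual code is truncated here:

    # Sketch: wrap the helpers as UDFs (return types assumed; wordLength body is hypothetical).
    from pyspark.sql.functions import udf
    from pyspark.sql.types import LongType, DoubleType

    def wordCount(s):
        return len(s.split())

    def wordLength(s):
        words = s.split()
        return float(sum(len(w) for w in words)) / max(len(words), 1)

    wordCountUDF = udf(wordCount, LongType())
    wordLengthUDF = udf(wordLength, DoubleType())

    rawData.withColumn("wordCount", wordCountUDF("text")).show(5)
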
@@ -37,8 +37,7 @@
"\n",
"# Please note that this is a copy of the CIFAR10 dataset originally found here:\n",
"# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
"dataFile = \"cifar-10-python.tar.gz\"\n",
"dataURL = cdnURL + \"/CIFAR10/\" + dataFile"
"imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
]
},
{
@@ -54,24 +53,11 @@
"modelDir = \"dbfs:///models/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"mml-deploy": "local",
"collapsed": false
},
"outputs": [],
"source": [
"modelName = \"ConvNet\"\n",
"modelDir = \"file:\" + abspath(\"models\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the model and extract the data."
"Get the model"
]
},
{
@@ -80,49 +66,8 @@
"metadata": {},
"outputs": [],
"source": [
"import os, tarfile, pickle\n",
"import urllib.request\n",
"\n",
"d = ModelDownloader(spark, modelDir)\n",
"model = d.downloadByName(modelName)\n",
"if not os.path.isfile(dataFile):\n",
" urllib.request.urlretrieve(dataURL, dataFile)\n",
"with tarfile.open(dataFile, \"r:gz\") as f:\n",
" test_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/test_batch\"),\n",
" encoding=\"latin1\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Preprocess the images."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import col\n",
"from pyspark.sql.types import *\n",
"\n",
"def reshape_image(record):\n",
" image, label, filename = record\n",
" data = [float(x) for x in image.reshape(3,32,32).flatten()]\n",
" return data, label, filename\n",
"\n",
"convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n",
"\n",
"image_rdd = zip(test_dict[\"data\"], test_dict[\"labels\"], test_dict[\"filenames\"])\n",
"image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n",
"\n",
"imagesWithLabels = image_rdd.toDF([\"images\", \"labels\", \"filename\"])\n",
"imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))\n",
"imagesWithLabels.printSchema()\n",
"\n",
"imagesWithLabels.cache()"
"model = d.downloadByName(modelName)\n"
]
},
{
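
With the tar extraction and RDD reshaping gone, the test images come straight from a pre-converted Parquet file. A minimal sketch mirroring what the removed cell printed and cached — the column names are assumed to match the old ones ("images", "labels", "filename"):

    # Sketch: load the pre-converted CIFAR-10 test set and inspect it.
    imagesWithLabels = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet")
    imagesWithLabels.printSchema()   # columns assumed: "images", "labels", "filename"
    imagesWithLabels.cache()
    print(imagesWithLabels.count())
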
47 changes: 2 additions & 45 deletions notebooks/samples/DeepLearning - Transfer Learning.ipynb
@@ -14,7 +14,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"First, we load first batch of CIFAR-10 training data into NumPy array."
"Load DNN Model and pick one of the inner layers as feature output"
]
},
{
@@ -29,39 +29,7 @@
"from os.path import abspath\n",
"from pyspark.sql.functions import col, udf\n",
"from pyspark.sql.types import *\n",
"\n",
"cdnURL = \"https://mmlspark.azureedge.net/datasets\"\n",
"\n",
"# Please note that this is a copy of the CIFAR10 dataset originally found here:\n",
"# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
"dataFile = \"cifar-10-python.tar.gz\"\n",
"dataURL = cdnURL + \"/CIFAR10/\" + dataFile\n",
"\n",
"if not os.path.isfile(dataFile):\n",
" urllib.request.urlretrieve(dataURL, dataFile)\n",
"with tarfile.open(dataFile, \"r:gz\") as f:\n",
" train_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/data_batch_1\"),\n",
" encoding=\"latin1\")\n",
"\n",
"train_data = np.array(train_dict[\"data\"])\n",
"train_labels = np.array(train_dict[\"labels\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load DNN Model and pick one of the inner layers as feature output"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"modelName = \"ConvNet\"\n",
"modelDir = \"wasb:///models/\"\n",
"modelDir = \"file:\" + abspath(\"models\")\n",
"d = ModelDownloader(spark, modelDir)\n",
"model = d.downloadByName(modelName)\n",
@@ -83,18 +51,7 @@
"metadata": {},
"outputs": [],
"source": [
"def reshape_image(record):\n",
" image, label = record\n",
" data = [float(x) for x in image.reshape(3,32,32).flatten()]\n",
" return data, int(label)\n",
"\n",
"convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n",
"\n",
"image_rdd = zip(train_data,train_labels)\n",
"image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n",
"\n",
"imagesWithLabels = image_rdd.toDF([\"images\", \"labels\"])\n",
"imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))"
"imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
]
},
{
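
This notebook now caches the downloaded model under a local file: path rather than wasb:///, while the CIFAR10 notebook above uses dbfs:///models/. A small sketch of choosing the cache location per environment — the on_databricks flag is illustrative, not from the notebooks:

    # Sketch: pick a model cache location; `on_databricks` is a hypothetical toggle.
    from os.path import abspath

    on_databricks = False
    modelDir = "dbfs:///models/" if on_databricks else "file:" + abspath("models")
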
@@ -17,8 +17,7 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField"
"import pandas as pd\n"
]
},
{
@@ -34,21 +33,7 @@
"metadata": {},
"outputs": [],
"source": [
"dataFilePath = \"BreastCancer.csv\"\n",
"textSchema = StructType([StructField(\"Label\", IntegerType(), False),\n",
" StructField(\"Clump Thickness\", IntegerType(), False),\n",
" StructField(\"Uniformity of Cell Size\", IntegerType(), False),\n",
" StructField(\"Uniformity of Cell Shape\", IntegerType(), False),\n",
" StructField(\"Marginal Adhesion\", IntegerType(), False),\n",
" StructField(\"Single Epithelial Cell Size\", IntegerType(), False),\n",
" StructField(\"Bare Nuclei\", FloatType(), False),\n",
" StructField(\"Bland Chromatin\", IntegerType(), False),\n",
" StructField(\"Normal Nucleoli\", IntegerType(), False),\n",
" StructField(\"Mitoses\", IntegerType(), False),])\n",
"import os, urllib\n",
"if not os.path.isfile(dataFilePath):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
"data = spark.createDataFrame(pd.read_csv(dataFilePath, sep=\",\", header=0, na_values=\"?\"), textSchema)\n",
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\")\n",
"tune, test = data.randomSplit([0.80, 0.20])\n",
"tune.limit(10).toPandas()"
]
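
The removed CSV reader treated "?" as a missing value; the Parquet copy is assumed to already encode those as nulls. A quick sanity check after loading:

    # Sketch: count nulls per column to confirm missing values carried over (assumption about the Parquet copy).
    from pyspark.sql.functions import col, sum as sql_sum

    data = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet")
    data.select([sql_sum(col(c).isNull().cast("int")).alias(c) for c in data.columns]).show()
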
@@ -51,11 +51,7 @@
"metadata": {},
"outputs": [],
"source": [
"dataFile = \"On_Time_Performance_2012_9.csv\"\n",
"import os, urllib\n",
"if not os.path.isfile(dataFile):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
"flightDelay = spark.createDataFrame(pd.read_csv(dataFile))\n",
"flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
"# print some basic info\n",
"print(\"records read: \" + str(flightDelay.count()))\n",
"print(\"Schema: \")\n",
51 changes: 1 addition & 50 deletions notebooks/samples/Regression - Auto Imports.ipynb
@@ -29,62 +29,13 @@
"using `pandas.read_csv()`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Declare the schema for the data that will be converted from the pandas\n",
"DataFrame to a Spark DataFrame. Allow all fields to be nullable, so that\n",
"missing values can be handled appropriately, such as replacing them with\n",
"the mean or median value for that column."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pyspark.sql.types import LongType, StringType, DoubleType, StructType, StructField\n",
"\n",
"colSchema = (\n",
" (\"symboling\", LongType), (\"normalized-losses\", DoubleType), (\"make\", StringType),\n",
" (\"fuel-type\", StringType), (\"aspiration\", StringType), (\"body-style\", StringType),\n",
" (\"drive-wheels\", StringType), (\"engine-location\", StringType), (\"wheel-base\", DoubleType),\n",
" (\"length\", DoubleType), (\"width\", DoubleType), (\"height\", DoubleType),\n",
" (\"curb-weight\", LongType), (\"engine-type\", StringType), (\"num-of-cylinders\", StringType),\n",
" (\"engine-size\", LongType), (\"fuel-system\", StringType), (\"bore\", DoubleType),\n",
" (\"stroke\", DoubleType), (\"compression-ratio\", DoubleType), (\"horsepower\", DoubleType),\n",
" (\"peak-rpm\", DoubleType), (\"city-mpg\", LongType), (\"highway-mpg\", LongType),\n",
" (\"price\", DoubleType))\n",
"\n",
"tableSchema = StructType([StructField(column[0], column[1](),True) for column in colSchema])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read the data from the AutomobilePriceRaw.csv file into a pandas dataframe.\n",
"Specify possible reprsentations of missing values, and drop the `num-of-doors`\n",
"column as the data is read in."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataFile = \"AutomobilePriceRaw.csv\"\n",
"import os, urllib\n",
"if not os.path.isfile(dataFile):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
"data = spark.createDataFrame(pd.read_csv(dataFile,\n",
" na_values=[\"\", \" \", \"?\"],\n",
" usecols=tableSchema.names),\n",
" tableSchema)"
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet\")\n"
]
},
{
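
The deleted markdown noted that nullable fields let missing values be replaced with a column's mean or median; that step still applies after the switch to Parquet. A hedged sketch using Spark ML's Imputer (a stand-in here, not necessarily what the notebook uses) on numeric columns named in the removed schema, assuming they are stored as doubles:

    # Sketch: mean-impute numeric columns (names taken from the removed schema; numeric types assumed).
    from pyspark.ml.feature import Imputer

    data = spark.read.parquet("wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet")
    numericCols = ["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]
    imputer = Imputer(strategy="mean",
                      inputCols=numericCols,
                      outputCols=[c + "-imputed" for c in numericCols])
    data = imputer.fit(data).transform(data)
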
19 changes: 4 additions & 15 deletions notebooks/samples/Regression - Flight Delays.ipynb
@@ -37,21 +37,10 @@
"metadata": {},
"outputs": [],
"source": [
"# load raw data from small-sized 30 MB CSV file (trimmed to contain just what we use)\n",
"dataFilePath = \"On_Time_Performance_2012_9.csv\"\n",
"import os, urllib\n",
"if not os.path.isfile(dataFilePath):\n",
" urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath,\n",
" dataFilePath)\n",
"flightDelay = spark.createDataFrame(\n",
" pd.read_csv(dataFilePath,\n",
" dtype={\"Month\": np.float64, \"Quarter\": np.float64,\n",
" \"DayofMonth\": np.float64, \"DayOfWeek\": np.float64,\n",
" \"OriginAirportID\": np.float64, \"DestAirportID\": np.float64,\n",
" \"CRSDepTime\": np.float64, \"CRSArrTime\": np.float64}))\n",
"# Print information on the dataset we loaded\n",
"print(\"Records read: \" + str(flightDelay.count()))\n",
"print(\"Schema:\")\n",
"flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
"# print some basic info\n",
"print(\"records read: \" + str(flightDelay.count()))\n",
"print(\"Schema: \")\n",
"flightDelay.printSchema()\n",
"flightDelay.limit(10).toPandas()"
]
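
The removed pandas loader forced Month, Quarter, DayofMonth, DayOfWeek, OriginAirportID, DestAirportID, CRSDepTime and CRSArrTime to float64. If the Parquet copy stores them as integers (an assumption, not verified here), the equivalent cast in Spark would be:

    # Sketch: cast the columns the old loader coerced to float64 (only needed if the Parquet types differ).
    from pyspark.sql.functions import col

    numericCols = ["Month", "Quarter", "DayofMonth", "DayOfWeek",
                   "OriginAirportID", "DestAirportID", "CRSDepTime", "CRSArrTime"]
    for c in numericCols:
        flightDelay = flightDelay.withColumn(c, col(c).cast("double"))
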
