Skip to content

Commit 3ed9a90

Browse files
authored
Bootstrap fixes (#250)
1 parent 319cae6 commit 3ed9a90

File tree

4 files changed

+78
-59
lines changed

4 files changed

+78
-59
lines changed

Diff for: bootstrap/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,6 @@ To bootstrap from the existing MLOpsPython repository:
1313
1. Ensure Python 3 is installed locally
1414
1. Clone this repository locally
1515
1. Run bootstrap.py script
16-
`python bootstrap.py --d [dirpath] --n [projectname]`
16+
`python bootstrap.py -d [dirpath] -n [projectname]`
1717
* `[dirpath]` is the absolute path to the root of the directory where MLOpsPython is cloned
1818
* `[projectname]` is the name of your ML project

Diff for: bootstrap/bootstrap.py

+51-49
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22
import sys
33
import platform
44
import argparse
5-
# import shutil
6-
# from git import Repo
5+
import re
76

87

98
class Helper:
@@ -25,13 +24,7 @@ def project_name(self):
2524
def git_repo(self):
2625
return self._git_repo
2726

28-
# def clonerepo(self):
29-
# # Download MLOpsPython repo from git
30-
# Repo.clone_from(
31-
# self._git_repo, self._project_directory, branch="master", depth=1) # NOQA: E501
32-
# print(self._project_directory)
33-
34-
def renamefiles(self):
27+
def rename_files(self):
3528
# Rename all files starting with diabetes_regression with project name
3629
strtoreplace = "diabetes_regression"
3730
dirs = [".pipelines", r"ml_service/pipelines"]
@@ -42,10 +35,11 @@ def renamefiles(self):
4235
if(filename.find(strtoreplace) != -1):
4336
src = os.path.join(self._project_directory, normDir, filename) # NOQA: E501
4437
dst = os.path.join(self._project_directory,
45-
normDir, filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501
38+
normDir,
39+
filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501
4640
os.rename(src, dst)
4741

48-
def renamedir(self):
42+
def rename_dir(self):
4943
dir = "diabetes_regression"
5044
src = os.path.join(self._project_directory, dir)
5145
for path, subdirs, files in os.walk(src):
@@ -57,39 +51,37 @@ def renamedir(self):
5751
new_name = os.path.join(newPath, name)
5852
os.rename(file_path, new_name)
5953

60-
def deletedir(self):
54+
def delete_dir(self):
    """Delete the template directories a bootstrapped project does not keep.

    Removes the ``docs`` and ``diabetes_regression`` directories found
    directly under ``self._project_directory``, including their contents.

    Deletion is best-effort: a directory that is missing or cannot be
    removed is reported on stdout and the remaining directories are still
    processed. No exception is propagated for such failures.
    """
    # Imported locally so this method does not rely on a file-level import.
    import shutil

    unwanted = ["docs", "diabetes_regression"]
    for name in unwanted:
        target = os.path.join(self._project_directory, os.path.normpath(name))
        try:
            # shutil.rmtree is portable and takes the path as data, so no
            # shell command has to be assembled (quotes or other special
            # characters in the path can no longer break the command).
            shutil.rmtree(target)
        except OSError as err:
            print("Could not delete \"%s\": %s" % (target, err))
7063

71-
def cleandir(self):
64+
def clean_dir(self):
    """Remove every file below the ``data`` and ``experimentation`` directories.

    Each directory is looked up under ``self._project_directory`` and walked
    recursively. Only files are deleted; the directories themselves,
    including nested sub-directories, are left in place. A directory that
    does not exist is skipped, because ``os.walk`` yields nothing for it.
    """
    for top in ("data", "experimentation"):
        start = os.path.join(self._project_directory, top)
        # The walk tuple uses its own names so it cannot rebind the sequence
        # being iterated by the outer loop.
        for root, _subdirs, file_names in os.walk(start):
            for file_name in file_names:
                os.remove(os.path.join(root, file_name))
7871

79-
def validateargs(self):
72+
def validate_args(self):
    """Validate the project directory and project name held by this helper.

    Raises:
        ValueError: if ``self._project_directory`` is not an existing
            directory, if ``self._project_name`` is not 3 to 15 characters
            long, or if the name contains anything other than word
            characters.
    """
    if not os.path.isdir(self._project_directory):
        raise ValueError("Not a valid directory. Please provide an absolute directory path.")  # NOQA: E501
    if not 3 <= len(self._project_name) <= 15:
        raise ValueError("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.")  # NOQA: E501
    # fullmatch rather than search("^...$"): "$" also matches just before a
    # trailing newline, which would let a name such as "abc\n" through.
    # NOTE(review): \w admits digits as well as letters and underscores,
    # although the message mentions only letters and underscores - confirm
    # which of the two is intended.
    if not re.fullmatch(r"\w+", self._project_name):
        raise ValueError("Invalid characters in project name. Project name should be 3 to 15 chars long, letters and underscores only.")  # NOQA: E501
8880

8981

90-
def replaceprojectname(project_dir, project_name, rename_name):
82+
def replace_project_name(project_dir, project_name, rename_name):
9183
# Replace instances of rename_name within files with project_name
92-
dirs = [r".env.example",
84+
files = [r".env.example",
9385
r".pipelines/code-quality-template.yml",
9486
r".pipelines/pr.yml",
9587
r".pipelines/diabetes_regression-ci.yml",
@@ -107,42 +99,52 @@ def replaceprojectname(project_dir, project_name, rename_name):
10799
r"diabetes_regression/conda_dependencies.yml",
108100
r"diabetes_regression/evaluate/evaluate_model.py",
109101
r"diabetes_regression/register/register_model.py",
110-
r"diabetes_regression/training/test_train.py"] # NOQA: E501
102+
r"diabetes_regression/training/test_train.py"]
111103

112-
for dir in dirs:
113-
file = os.path.join(project_dir, os.path.normpath(dir))
114-
fin = open(file,
115-
"rt", encoding="utf8")
116-
data = fin.read()
117-
data = data.replace(rename_name, project_name)
118-
fin.close()
119-
fin = open(os.path.join(project_dir, file), "wt", encoding="utf8") # NOQA: E501
120-
fin.write(data)
121-
fin.close()
104+
for file in files:
105+
path = os.path.join(project_dir, os.path.normpath(file))
106+
try:
107+
with open(path, "rt", encoding="utf8") as f_in:
108+
data = f_in.read()
109+
data = data.replace(rename_name, project_name)
110+
with open(os.path.join(project_dir, file), "wt", encoding="utf8") as f_out: # NOQA: E501
111+
f_out.write(data)
112+
except IOError as e:
113+
print("Could not modify \"%s\". Is the MLOpsPython repo already cloned at \"%s\"?" % (path, project_dir)) # NOQA: E501
114+
raise e
122115

123116

124117
def main(args):
    """Bootstrap a new project from a local clone of the MLOpsPython repo.

    Reads ``-d/--directory`` and ``-n/--name`` from the process command
    line, validates them, empties the sample ``data`` and
    ``experimentation`` directories, replaces the ``diabetes_regression``
    and ``diabetes`` names inside the template files, renames the matching
    files and directories, and finally deletes the directories that are no
    longer needed.

    Args:
        args: Kept for interface compatibility. It is not read: argparse is
            invoked without an explicit argument list and therefore parses
            ``sys.argv`` itself.

    Returns:
        int: 0 when every step succeeded, 1 when a step raised (the error
        is printed). Previously a failure was also reported as 0.
    """
    parser = argparse.ArgumentParser(description='New Template')
    parser.add_argument("-d",
                        "--directory",
                        type=str,
                        required=True,
                        help="Absolute path to new project directory")
    parser.add_argument("-n",
                        "--name",
                        type=str,
                        required=True,
                        help="Name of the project [3-15 chars, letters and underscores only]")  # NOQA: E501
    try:
        # A separate local name keeps the `args` parameter from being rebound.
        parsed = parser.parse_args()

        project_directory = parsed.directory
        project_name = parsed.name

        helper = Helper(project_directory, project_name)
        helper.validate_args()
        helper.clean_dir()

        # Replace the longer name first so "diabetes_regression" is not
        # partially rewritten by the plain "diabetes" substitution.
        replace_project_name(project_directory, project_name, "diabetes_regression")  # NOQA: E501
        replace_project_name(project_directory, project_name, "diabetes")

        helper.rename_files()
        helper.rename_dir()
        helper.delete_dir()
    except Exception as e:
        print(e)
        # Signal the failure to the caller instead of reporting success.
        return 1

    return 0
147149

148150

Diff for: ml_service/pipelines/diabetes_regression_build_train_pipeline.py

+10-9
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33
from azureml.pipeline.core import Pipeline, PipelineData
44
from azureml.core import Workspace, Dataset, Datastore
55
from azureml.core.runconfig import RunConfiguration
6+
from ml_service.pipelines.load_sample_data import create_sample_data_csv
67
from ml_service.util.attach_compute import get_compute
78
from ml_service.util.env_variables import Env
89
from ml_service.util.manage_environment import get_environment
9-
from sklearn.datasets import load_diabetes
10-
import pandas as pd
1110
import os
1211

1312

@@ -57,14 +56,16 @@ def main():
5756

5857
# Check to see if dataset exists
5958
if (dataset_name not in aml_workspace.datasets):
60-
# Create dataset from diabetes sample data
61-
sample_data = load_diabetes()
62-
df = pd.DataFrame(
63-
data=sample_data.data,
64-
columns=sample_data.feature_names)
65-
df['Y'] = sample_data.target
59+
# This call creates an example CSV from sklearn sample data. If you
60+
# have already bootstrapped your project, you can comment this line
61+
# out and use your own CSV.
62+
create_sample_data_csv()
63+
64+
# Use a CSV to read in the data set.
6665
file_name = 'diabetes.csv'
67-
df.to_csv(file_name, index=False)
66+
67+
if (not os.path.exists(file_name)):
68+
raise Exception("Could not find CSV dataset at \"%s\". If you have bootstrapped your project, you will need to provide a CSV." % file_name) # NOQA: E501
6869

6970
# Upload file to default datastore in workspace
7071
datatstore = Datastore.get(aml_workspace, datastore_name)

Diff for: ml_service/pipelines/load_sample_data.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
2+
import pandas as pd
3+
from sklearn.datasets import load_diabetes
4+
5+
6+
# Loads the diabetes sample data from sklearn and produces a csv file that can
7+
# be used by the build/train pipeline script.
8+
def create_sample_data_csv(file_name='diabetes.csv'):
    """Write the sklearn diabetes sample data set to a CSV file.

    The feature columns are named after ``feature_names`` of the sklearn
    data set and the target values are stored in an extra ``Y`` column. The
    row index is not written.

    Args:
        file_name: Path of the CSV file to create. Defaults to
            ``'diabetes.csv'`` so the function can be called without
            arguments, as the build/train pipeline script does. The default
            is deliberately the fixed diabetes name, so a bootstrapped
            project that expects a differently named CSV fails fast.
    """
    sample_data = load_diabetes()
    df = pd.DataFrame(
        data=sample_data.data,
        columns=sample_data.feature_names)
    df['Y'] = sample_data.target
    df.to_csv(file_name, index=False)

0 commit comments

Comments
 (0)