In [3]:
from azureml.core import Workspace, Experiment
#change myworspace name accordingly
ws = Workspace.get(name="myworkspace")
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: myworkspace
Azure region: southcentralus
Subscription id: 26fa4a48-44da-4329-be20-b2ed37dcaadc
Resource group: sdk


In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
# Choose a name for your CPU cluster
cpu_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

In [34]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from sklearn.linear_model import LogisticRegression
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive.parameter_expressions import choice
import os

# Specify parameter sampler
### YOUR CODE HERE ###
ps = RandomParameterSampling({
    "C": uniform(0.1, 1),
    "max_iter": choice(100, 1000, 10000),
    }
)
    
# Specify a Policy
policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")

### YOUR CODE HERE ###
# Create a SKLearn estimator for use with train.py
est = SKLearn(
                source_directory='./training',
                compute_target=compute_target,
                entry_script='train.py'

            )

# Create a HyperDriveConfig using the estimator, 
#hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
                             estimator=est,
                             hyperparameter_sampling=ps,
                             policy=policy,
                             primary_metric_name="Accuracy",
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=10,
                             max_concurrent_runs=1)

In [35]:
# Submit your hyperdrive run to the experiment and show run details 
#with the widget.

### YOUR CODE HERE ###
hyperdrive_run = exp.submit(hyperdrive_config)
hyperdrive_run.wait_for_completion(show_output=True)



RunId: HD_f595f46b-4d99-4063-ab3a-428003517f78
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_f595f46b-4d99-4063-ab3a-428003517f78?wsid=/subscriptions/26fa4a48-44da-4329-be20-b2ed37dcaadc/resourcegroups/sdk/workspaces/myworkspace

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-12-19T09:38:36.765974][API][INFO]Experiment created<END>\n"<START>[2020-12-19T09:38:37.7165274Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2020-12-19T09:38:37.523840][GENERATOR][INFO]Trying to sample '1' jobs from the hyperparameter space<END>\n""<START>[2020-12-19T09:38:37.828748][GENERATOR][INFO]Successfully sampled '1' jobs, they will soon be submitted to the execution target.<END>\n"


In [36]:
import joblib
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
#parameter_values = best_run.get_details()['runDefinition']['Arguments']

In [37]:
print(best_run.get_file_names())
#best_run.register_model(model_path='outputs/model.joblib',model_name='partA_model')

['azureml-logs/55_azureml-execution-tvmps_1ae7c21b074cacffc701b879668f436c56cb1e950788ac97aeedf330001ab866_d.txt', 'azureml-logs/65_job_prep-tvmps_1ae7c21b074cacffc701b879668f436c56cb1e950788ac97aeedf330001ab866_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_1ae7c21b074cacffc701b879668f436c56cb1e950788ac97aeedf330001ab866_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/100_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']


In [38]:
print(best_run_metrics)
#joblib.dump(value= best_run, filename = 'model.joblib')

{'Regularization Strength:': 0.7174602059353603, 'Max iterations:': 1000, 'Accuracy': 0.9151493080844866}


In [10]:
import pandas as pd
def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
    x_df = data.to_pandas_dataframe().dropna()
    jobs = pd.get_dummies(x_df.job, prefix="job")
    x_df.drop("job", inplace=True, axis=1)
    x_df = x_df.join(jobs)
    x_df["marital"] = x_df.marital.apply(lambda s: 1 if s == "married" else 0)
    x_df["default"] = x_df.default.apply(lambda s: 1 if s == "yes" else 0)
    x_df["housing"] = x_df.housing.apply(lambda s: 1 if s == "yes" else 0)
    x_df["loan"] = x_df.loan.apply(lambda s: 1 if s == "yes" else 0)
    contact = pd.get_dummies(x_df.contact, prefix="contact")
    x_df.drop("contact", inplace=True, axis=1)
    x_df = x_df.join(contact)
    education = pd.get_dummies(x_df.education, prefix="education")
    x_df.drop("education", inplace=True, axis=1)
    x_df = x_df.join(education)
    x_df["month"] = x_df.month.map(months)
    x_df["day_of_week"] = x_df.day_of_week.map(weekdays)
    x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1 if s == "success" else 0)

    y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0)
    return x_df,y_df

In [11]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Dataset, Datastore
from sklearn.model_selection import train_test_split

data_path='https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
ds =Dataset.Tabular.from_delimited_files(path=data_path)
x,y=clean_data(ds)
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=100)


In [12]:
from sklearn.linear_model import LogisticRegression
best_model = LogisticRegression(C=0.5617262330057384, max_iter=1000).fit(x_train, y_train)
accuracy = best_model.score(x_test, y_test)

In [13]:
best_model_2=LogisticRegression(C=0.7174602059353603, max_iter=1000).fit(x_train, y_train)
accuracy2 = best_model.score(x_test, y_test)
print(accuracy2)


0.9150279193979121


In [17]:
from sklearn.metrics import f1_score
y_pred=best_model.predict(x_test)
f1_score(y_test, y_pred, average='macro')

0.7316430559464615