## Workspace

In [10]:
import azureml.core
from azureml.core import Workspace

# Connect to the Azure ML workspace described by the saved config file,
# then report the SDK version and the workspace name.
ws = Workspace.from_config()
sdk_version, workspace_name = azureml.core.VERSION, ws.name
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, workspace_name))

Ready to use Azure ML 1.26.0 to work with projet_7


## Compute

In [11]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

In [12]:
cluster_name = 'cluster-projet7'

# Reuse the compute target registered under `cluster_name` when the lookup
# succeeds; otherwise provision a new one and wait for it to be ready.
try:
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup failed, so try to create the cluster.
    try:
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2,
        )
        training_cluster = ComputeTarget.create(ws, cluster_name, provisioning_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as creation_error:
        # Best effort: the failure is only printed, the notebook keeps running.
        print(creation_error)
else:
    print('Found existing cluster, use it.')

Found existing cluster, use it.


## Script model

In [13]:
%%writefile lstm/model-rnn.py
# Import section of the training script written to lstm/model-rnn.py.
# The two print() calls bracket the imports so the run log shows when the
# (potentially slow) library loading starts and ends.
print('print importing lib...')
import argparse
from azureml.core import Run
from azureml.core import Dataset

# NOTE(review): only Dense, Dropout, Embedding and SimpleRNN are referenced in
# the rest of this script; LSTM, Activation, Input and GRU are unused.
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SimpleRNN, GRU
# NOTE(review): Model is not referenced in the rest of this script (Sequential is).
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential


# NOTE(review): roc_curve, auc, classification_report and confusion_matrix are
# not referenced in the rest of this script; only train_test_split is.
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# NOTE(review): pd and np are not referenced by name in the rest of this script.
import pandas as pd 
import numpy as np

# NOTE(review): Counter is not referenced in the rest of this script.
from collections import Counter

print('lib imported...')

# Get script arguments.
# Both options are mandatory: without them the script would carry on with
# None values and only fail later, when the run or the dataset is looked up.
parser = argparse.ArgumentParser()
parser.add_argument('--run-id', type=str, dest='run_id', required=True,
                    help='id of the preprocessing run whose metrics provide vocab_size')
parser.add_argument('--input-data', type=str, dest='training_dataset_id', required=True,
                    help='name of the registered training dataset')
args = parser.parse_args()

# Set parameters
run_id = args.run_id
# The value is used as a dataset *name* (see Dataset.get_by_name below),
# despite the historical `training_dataset_id` destination.
dataset_name = args.training_dataset_id

# Current run context and the workspace its experiment belongs to
run = Run.get_context()
ws = run.experiment.workspace

# Read vocab_size from the metrics of the preprocessing run identified by run_id
print('get vocab size...')
preprocessing_run = ws.get_run(run_id)
preprocessing_metrics = preprocessing_run.get_metrics()
vocab_size = preprocessing_metrics['vocab_size']
print('vocab size loaded...')

# Fetch the dataset registered under dataset_name and load it as a DataFrame
print("loading data...")
registered_dataset = Dataset.get_by_name(ws, dataset_name)
data = registered_dataset.to_pandas_dataframe()
print("data loaded...")

# Inputs (text column cast to str) and target (label column)
texts = data.text.astype(str)
labels = data.label

# Text to integer sequences, with the tokenizer capped at num_words=vocab_size
# NOTE(review): the tokenizer is fitted on all rows, before the train/test
# split below -- confirm this is intended.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
token_sequences = tokenizer.texts_to_sequences(texts)

# Pad every sequence to a common length
padded_sequences = sequence.pad_sequences(token_sequences)

# Size of the tokenizer word index, plus one
num_words = len(tokenizer.word_index) + 1

# Common sequence length: second dimension of the padded array
max_l = padded_sequences.shape[1]

# Hold out 30% of the rows for testing, with a fixed random_state
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(
    padded_sequences,
    labels,
    test_size=0.3,
    random_state=2,
)

# Model: embedding -> simple RNN -> dense -> dropout -> sigmoid output
model2 = Sequential()

# Embedding: vocab_size input indices, 64-dimensional vectors,
# sequences of length max_l
model2.add(Embedding(vocab_size,
                     output_dim=64,
                     input_length=max_l))

# Recurrent layer
model2.add(SimpleRNN(128))

# Fully connected layer
model2.add(Dense(128, activation='relu'))

# Dropout against overfitting
model2.add(Dropout(0.2))

# Output layer: a single sigmoid unit, i.e. a probability
model2.add(Dense(1, activation='sigmoid'))

# Compile. compile() configures the model in place and returns None, so its
# result is deliberately not stored (the former `history = ...` held None).
model2.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])

# Fit
model2.fit(X_train1, Y_train1, epochs=50, batch_size=128)

# Evaluate on the held-out split. The model is compiled with a single metric
# ('accuracy'), so evaluate() returns two values: the loss, then the accuracy.
test_loss, test_accuracy = model2.evaluate(X_test1, Y_test1)

# Log each value under its own name. Previously the whole [loss, accuracy]
# list was logged as 'accuracy', which mislabelled the loss.
run.log('loss', float(test_loss))
run.log('accuracy', float(test_accuracy))

run.complete()


Overwriting lstm/model-rnn.py


In [14]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

## Environment

In [15]:
# Retrieve the environment registered in the workspace under the name 'proj7-h'
registered_env = Environment.get(workspace=ws, name='proj7-h')

## Config

In [16]:
# Command-line arguments forwarded to the training script
script_arguments = [
    '--input-data', 'train-pre',
    '--run-id', 'preprocessing_1621180841_b12833e2',
]

# Script run configuration: lstm/model-rnn.py, run in the registered
# environment on the compute target named by cluster_name
script_config = ScriptRunConfig(
    source_directory='lstm',
    script='model-rnn.py',
    arguments=script_arguments,
    environment=registered_env,
    compute_target=cluster_name,
)

# Submit the experiment, show the monitoring widget, then block until the run
# completes (the final expression is what the cell displays)
experiment_name = 'model1'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'model1_1621182843_41ee3ff4',
 'target': 'cluster-projet7',
 'status': 'Completed',
 'startTimeUtc': '2021-05-16T16:38:51.320798Z',
 'endTimeUtc': '2021-05-16T21:39:02.771238Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'f125aa3e-a411-4a1f-a928-3414b98ec603',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'd9db56c0-2808-4f18-8c01-1dde8e46b7e1'}, 'consumptionDetails': {'type': 'Reference'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'model-rnn.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--input-data',
   'train-pre',
   '--run-id',
   'preprocessing_1621180841_b12833e2'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cluster-projet7',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
