## Workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the Azure ML workspace described by the saved config file,
# then report the SDK version and the workspace name as a sanity check.
ws = Workspace.from_config()
ready_message = 'Ready to use Azure ML {} to work with {}'
print(ready_message.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.26.0 to work with projet_7


## Compute

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

In [3]:
cluster_name = 'cluster-projet7'

# Reuse the compute cluster when it already exists in the workspace;
# otherwise provision it and block until provisioning finishes.
try:
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # The cluster does not exist yet: create it.
    # Provisioning errors are intentionally NOT swallowed here. If creation
    # fails, `training_cluster` would stay undefined and the experiment
    # submission further down would fail with a far less obvious error,
    # so let the original exception (with its traceback) stop the notebook.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
    training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    training_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.


## Script model

In [4]:
%%writefile lstm/model-logreg.py
print('print importing lib...')
import argparse
from azureml.core import Run
from azureml.core import Dataset

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import pandas as pd 
import numpy as np

from collections import Counter

print('lib imported...')

# Get script arguments
parser = argparse.ArgumentParser()
parser.add_argument('--run-id', type=str, dest='run_id', help='run id to get preprocessed data')
parser.add_argument("--input-data", type=str, dest='training_dataset_id', help='training dataset')
args = parser.parse_args()

#set parameters
run_id = args.run_id
dataset_name = args.training_dataset_id

#workspace and run
run = Run.get_context()
ws = run.experiment.workspace

print('get vocab size...')
#get vocab_size from preprocessing run
run1 = ws.get_run(run_id)
v = run1.get_metrics()
vocab_size = v['vocab_size']
print('vocab size loaded...')

#get dataset
print("loading data...")
data = Dataset.get_by_name(ws, dataset_name).to_pandas_dataframe()
print("data loaded...")

#change text type from object to string
data['text'] = data['text'].astype(str)

#text to vect. Mindf=2 to get rid of term appearing only in 2 tweets
cv = CountVectorizer(min_df=2)
cv.fit(data['text'])

#split
X = cv.transform(data['text'])
y = data['label']


#splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

# Scaler
scaler = MaxAbsScaler()

# Scaling
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

#run test for different value of c
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c, solver='saga', max_iter=200)
    lr.fit(X_train_scale, y_train)
    print('Accuracy for C=%s: %s'
         % (c, accuracy_score(y_test, lr.predict(X_test_scale))))


run.complete()


Overwriting lstm/model-logreg.py


In [5]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

## Environment

In [6]:
# Fetch the environment registered in the workspace under the name 'proj7-h'.
registered_env = Environment.get(workspace=ws, name='proj7-h')

## Config

In [7]:
# Command-line arguments forwarded to the training script:
# the registered dataset name and the id of the preprocessing run.
script_arguments = [
    '--input-data', 'train-pre',
    '--run-id', 'preprocessing_1620804387_12320a41',
]

# Describe what to run (script in the 'lstm' folder), with which environment,
# and on which compute target.
script_config = ScriptRunConfig(
    source_directory='lstm',
    script='model-logreg.py',
    arguments=script_arguments,
    environment=registered_env,
    compute_target=cluster_name,
)

# Submit the run under the 'model-logreg' experiment, show the live widget,
# and block until the run finishes (the final run details are the cell output).
experiment_name = 'model-logreg'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'model-logreg_1621169534_9756326b',
 'target': 'cluster-projet7',
 'status': 'Completed',
 'startTimeUtc': '2021-05-16T12:56:52.927453Z',
 'endTimeUtc': '2021-05-16T13:02:29.757027Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'a5ca8c3a-a582-4bb4-837b-e75594558db8',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'd9db56c0-2808-4f18-8c01-1dde8e46b7e1'}, 'consumptionDetails': {'type': 'Reference'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'model-logreg.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--input-data',
   'train-pre',
   '--run-id',
   'preprocessing_1620804387_12320a41'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cluster-projet7',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': 