## Connect to the Azure ML workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the saved config file
ws = Workspace.from_config()
sdk_version = azureml.core.VERSION
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, ws.name))

Ready to use Azure ML 1.26.0 to work with projet_7


## Datastore

In [2]:
from azureml.core import Dataset

In [3]:
# Keep a handle on the workspace's default datastore
default_ds = ws.get_default_datastore()

## Experiment folder

In [4]:
import os

#experiment_folder = 'lstm-embedding'
#os.makedirs(experiment_folder, exist_ok=True)
#print(experiment_folder, 'folder created')

## Define environment

In [5]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

In [6]:
# Build the 'proj7-h' environment from the conda specification file env.yml
env = Environment.from_conda_specification('proj7-h', 'env.yml')

In [7]:
#env = Environment.from_pip_requirements('proj7-1', 'lstm/requirements.txt')

In [8]:
#env = Environment.from_existing_conda_environment("proj7-e", 'proj7')

In [9]:
# Confirm which environment object is currently defined in the kernel
print(env.name, 'environment defined')

proj7-h environment defined


In [10]:
# Register the environment in the workspace so that later cells can fetch it by name;
# the value returned by register() is shown as the cell output.
env.register(workspace=ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20210301.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "proj7-h",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "conda-forge",
                "defaults"
    

## Compute cluster

In [11]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

In [12]:
cluster_name = 'cluster-projet7'

try:
    # Reuse the compute target if one with this name already exists in the workspace
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # No compute target with that name: provision a new cluster
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Show the reason, then re-raise: without a cluster `training_cluster` is undefined
        # and the experiment submitted further down cannot run, so the notebook must not
        # silently continue past this cell.
        print(ex)
        raise

Found existing cluster, use it.


## Preprocessing script

In [13]:
%%writefile lstm/preprocessing.py
print('print importing lib...')
import argparse
from azureml.core import Run
from azureml.core import Dataset
#import joblib
import os

import numpy as np
import pandas as pd
import re
import string

from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from keras.preprocessing.text import text_to_word_sequence
from nltk.stem import PorterStemmer
from collections import Counter
import nltk
import contractions

print('lib imported...')
# Get script arguments
parser = argparse.ArgumentParser()
# --input-data is required: without it the dataset name would be None and the failure
# would only surface later, inside Dataset.get_by_name.
parser.add_argument("--input-data", type=str, dest='training_dataset_id', required=True,
                    help='training dataset')
args = parser.parse_args()

# Name of the registered dataset to preprocess
dataset_name = args.training_dataset_id

# Get the experiment run context and the workspace it belongs to
run = Run.get_context()
ws = run.experiment.workspace

# Load the registered dataset as a pandas DataFrame
print("loading data...")
data = Dataset.get_by_name(ws, dataset_name).to_pandas_dataframe()

# Download the NLTK stop-word corpus.
# The words are kept in a set because filter_stopword tests membership once per token;
# a set makes each test constant-time instead of a scan over the whole list.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# URL removal. The pattern is explained at https://regex101.com/r/NmVGOo/8
def del_url(text):
    """Return ``text`` with every URL-like substring replaced by a single space."""
    url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.sub(url_pattern, " ", text)

# Twitter handle (@mention) removal
def del_tweet(text):
    """Return ``text`` with every '@' followed by non-whitespace characters replaced by a space.

    The pattern is a raw string: in a normal string literal ``\\s`` is an invalid
    escape sequence, which Python reports with a warning.
    """
    return re.sub(r'@[^\s]+', ' ', text)

# Digit removal
def del_num(text):
    """Return ``text`` with every run of the digits 0-9 deleted (nothing is inserted in its place)."""
    digit_run = '[0-9]+'
    return re.sub(digit_run, '', text)

# Collapse any character repeated three or more times in a row
def del_rep(text):
    """Return ``text`` where each run of 3+ identical characters is reduced to one occurrence.

    Runs of exactly two identical characters are left untouched.
    """
    repeated_run = r'(.)\1{2,}'
    single_occurrence = r'\1'
    return re.sub(repeated_run, single_occurrence, text)

def contraction(text):
    """Return the result of ``contractions.fix`` applied to ``text``."""
    expanded = contractions.fix(text)
    return expanded

def stemmization(list):
    """Return a new list holding the Porter stem of each word in ``list``.

    The parameter keeps its original name (which shadows the built-in ``list``
    inside this function) so that existing callers are unaffected.
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in list]

def filter_stopword(list):
    """Return the words of ``list`` that are not in the module-level ``stop_words``, order preserved."""
    return [token for token in list if token not in stop_words]

# E-mail address removal
def del_mail(text):
    """Return ``text`` with every ``something@something`` token replaced by a single space.

    At least one non-whitespace character is required before the '@', so a bare
    '@user' handle is not matched here. The pattern is a raw string: in a normal
    string literal ``\\s`` is an invalid escape sequence, which Python reports with
    a warning.
    """
    return re.sub(r'[^\s]+@[^\s]+', ' ', text)

def preprocess(data, col):
    """Clean and tokenize the text column ``col`` of ``data``.

    The column is overwritten step by step, so ``data`` is modified in place;
    the same object is also returned. After the call each cell of ``col`` holds
    a list of stemmed tokens.
    """
    # Lower-case everything first
    data[col] = data[col].str.lower()

    # String-level cleaners, in order: URLs must be removed before e-mail
    # addresses and @handles, then contractions are expanded.
    for cleaner in (del_url, del_mail, del_tweet, contraction):
        data[col] = data[col].apply(cleaner)

    # Strip punctuation: maketrans builds a translation table whose third
    # argument lists the characters to delete; translate applies that table.
    punctuation_table = str.maketrans('', '', string.punctuation)
    data[col] = data[col].str.translate(punctuation_table)

    # Drop non-ASCII characters via an encode/decode round trip
    ascii_bytes = data[col].str.encode('ascii', 'ignore')
    data[col] = ascii_bytes.str.decode('utf-8', 'ignore')

    # Remove digits, then collapse repeated characters
    # (the repeated-character step could be skipped to compare results)
    for cleaner in (del_num, del_rep):
        data[col] = data[col].apply(cleaner)

    # Spelling/slang correction is deliberately not applied here (very slow):
    #data[col] = data[col].apply(spelling)

    # Split into a word sequence, drop stop words, then stem
    for token_step in (text_to_word_sequence, filter_stopword, stemmization):
        data[col] = data[col].apply(token_step)

    return data

print('start preprocess...')
# preprocess() rewrites the 'text' column in place and returns that same frame
data = preprocess(data, 'text')

# vocabulary size
def get_vocab(data, col, min_count=3):
    """Return the number of distinct tokens that occur at least ``min_count`` times.

    Parameters
    ----------
    data : mapping or DataFrame
        ``data[col]`` must be an iterable of token sequences.
    col : str
        Key/column holding the token sequences.
    min_count : int, optional
        Minimum number of occurrences for a token to be counted. The default
        of 3 reproduces the previous hard-coded threshold (tokens seen fewer
        than 3 times are ignored).

    Returns
    -------
    int
        Size of the vocabulary after discarding low-frequency tokens.
    """
    voca = Counter()
    for tokens in data[col]:
        voca.update(tokens)
    # Count the kept tokens directly instead of building a list of the rejected ones
    return sum(1 for count in voca.values() if count >= min_count)

print('getting vocab size...')
# Vocabulary size computed on the tokenized 'text' column
vocab_size = get_vocab(data, 'text')
# Record the value on the run under the metric name 'vocab_size'
run.log('vocab_size', vocab_size)

def seq_to_text(seq):
    """Join the strings in ``seq`` into one string, separated by single spaces."""
    return ' '.join(seq)

print('appying seq to text...')
data['text'] = data['text'].apply(seq_to_text)

# Save the preprocessed data and attach it to the run under outputs/.
# The file name follows the dataset name (e.g. 'train' -> 'train-pre.csv') so that a run
# on another dataset does not publish its result under the name 'train-pre.csv'.
output_file = '{}-pre.csv'.format(dataset_name)
data.to_csv(output_file, index=False, header=True)
# The CSV is written to the working directory rather than to ./outputs, so it is
# uploaded explicitly; no local 'outputs' folder is needed for that.
run.upload_file(name='outputs/' + output_file, path_or_stream='./' + output_file)


run.complete()


Overwriting lstm/preprocessing.py


In [14]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

In [15]:
# Experiment that will hold the preprocessing runs
experiment_name = 'preprocessing'
experiment = Experiment(workspace=ws, name=experiment_name)

# Fetch the environment registered in the workspace
registered_env = Environment.get(ws, 'proj7-h')

# Describe the job: which script, with which arguments, environment and compute target
script_config = ScriptRunConfig(
    source_directory='lstm',
    script='preprocessing.py',
    arguments=['--input-data', 'train'],
    environment=registered_env,
    compute_target=cluster_name,
)

# Submit the run, show its widget, then block until it finishes
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'preprocessing_1621180841_b12833e2',
 'target': 'cluster-projet7',
 'status': 'Completed',
 'startTimeUtc': '2021-05-16T16:04:34.027998Z',
 'endTimeUtc': '2021-05-16T16:10:35.429919Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '05522235-a897-4f48-982a-69bb74685e3e',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '05c73edc-44c7-449a-a368-d886a0feda0e'}, 'consumptionDetails': {'type': 'Reference'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'preprocessing.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--input-data', 'train'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cluster-projet7',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'priority': None,
  'credential

In [17]:
# Print every metric logged by the run, one "name value" pair per line
metrics = run.get_metrics()
for metric_name, metric_value in metrics.items():
    print(metric_name, metric_value)

vocab_size 6963


In [18]:
# List the files attached to the run, one per line
for file_name in run.get_file_names():
    print(file_name)

azureml-logs/55_azureml-execution-tvmps_fcddfe71e7c57134015c48375eca90957ca8d7b718cd24a281b8724cc9352405_p.txt
azureml-logs/65_job_prep-tvmps_fcddfe71e7c57134015c48375eca90957ca8d7b718cd24a281b8724cc9352405_p.txt
azureml-logs/70_driver_log.txt
azureml-logs/75_job_post-tvmps_fcddfe71e7c57134015c48375eca90957ca8d7b718cd24a281b8724cc9352405_p.txt
azureml-logs/process_info.json
azureml-logs/process_status.json
logs/azureml/111_azureml.log
logs/azureml/dataprep/backgroundProcess.log
logs/azureml/dataprep/backgroundProcess_Telemetry.log
logs/azureml/job_prep_azureml.log
logs/azureml/job_release_azureml.log
outputs/sample-pre.csv


In [19]:
# Show the run's details; the returned value is displayed as the cell output
# (presumably with log contents included, going by the method name — can be very long)
run.get_details_with_logs()

{'runId': 'preprocessing_1620386671_6f8c3935',
 'target': 'cluster-projet7',
 'status': 'Completed',
 'startTimeUtc': '2021-05-07T11:27:29.792364Z',
 'endTimeUtc': '2021-05-07T11:29:49.302828Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'db65c98b-ac29-48d3-aa80-b630b55fbe6f',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '8d7b3d11-ae66-4cc3-a146-8fdcb3836526'}, 'consumptionDetails': {'type': 'Reference'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'preprocessing.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--input-data', 'sample'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cluster-projet7',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'priority': None,
  'credentia

In [24]:
# The run above is submitted with '--input-data train' and preprocessing.py publishes its
# result as 'outputs/train-pre.csv'; 'outputs/sample-pre.csv' only existed on an earlier run.
run.download_file('outputs/train-pre.csv')