## Connect to the Azure ML workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the saved config file and report
# which SDK version / workspace are in use.
ws = Workspace.from_config()
sdk_version, workspace_name = azureml.core.VERSION, ws.name
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, workspace_name))

Ready to use Azure ML 1.26.0 to work with projet_7


## Datastore

In [2]:
from azureml.core import Dataset

In [3]:
# Handle on the workspace's default datastore (not referenced again in the cells shown here).
default_ds = ws.get_default_datastore()

## Define environment

In [4]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

In [5]:
# Disabled on purpose: these two lines build the 'proj7-h' environment from env.yml
# and register it in the workspace. The submission cell further down fetches an
# environment with that name via Environment.get, so presumably they were run once
# beforehand -- uncomment to (re)register the environment.
#env = Environment.from_conda_specification('proj7-h', 'env.yml')
#env.register(workspace=ws)

## Compute cluster

In [6]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

In [7]:
cluster_name = 'cluster-projet7'

try:
    # Look the compute target up by name; a missing target raises ComputeTargetException
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # No such cluster yet: provision one and block until it is ready.
    # Any provisioning error is printed rather than raised.
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2,
        )
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)
else:
    print('Found existing cluster, use it.')

Found existing cluster, use it.


## Training script: preprocessing + GloVe LSTM

In [8]:
%%writefile lstm/glove.py
print('print importing lib...')
import argparse
from azureml.core import Run
from azureml.core import Dataset
#import joblib
import os

import numpy as np
import pandas as pd
import re
import string
import pathlib


from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SimpleRNN, GRU
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, model_from_json
from keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model

from sklearn.model_selection import train_test_split
from collections import Counter

import nltk
import contractions

print('lib imported...')
# Command-line interface of the script: two dataset names and three integer hyper-parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", dest='training_dataset_id', type=str, help='training dataset')
parser.add_argument("--glove-weight", dest='glove_dataset_id', type=str, help='glove weight')
parser.add_argument("--batch-size", dest='batch_size', type=int)
parser.add_argument("--epoch", dest='epoch', type=int)
parser.add_argument("--glove-dim", dest='glove_dim', type=int)
args = parser.parse_args()

# Unpack the parsed values into the module-level names used by the rest of the script
dataset_name, glove_name = args.training_dataset_id, args.glove_dataset_id
batch_size, epoch, glove_dim = args.batch_size, args.epoch, args.glove_dim

# Run context of the current execution; the workspace is reached through its experiment
run = Run.get_context()
ws = run.experiment.workspace

# Look both datasets up by name in the workspace and load them as pandas DataFrames
print("loading data...")
training_dataset = Dataset.get_by_name(ws, dataset_name)
data = training_dataset.to_pandas_dataframe()
glove_dataset = Dataset.get_by_name(ws, glove_name)
glove = glove_dataset.to_pandas_dataframe()

################################################################################################################################
#                FUNCTION DEFINITION
################################################################################################################################

# Regex flags shared by the helpers below that pass `flags=FLAGS` to the re module.
FLAGS = re.MULTILINE | re.DOTALL

# hashtag, allcaps and repeat are used as the replacement argument of re.sub:
# each receives a match object, reads the matched text with .group() and returns
# the replacement string. re.sub calls them once per match, so possibly several
# times for a single tweet.

def hashtag(text):
    """Replacement callback for a hashtag match.

    ``text`` is a regex match object; the first character of the matched
    string (the ``#`` for the pattern used in ``pps_glove``) is dropped.
    An all-uppercase body yields ``<hashtag> body <allcaps>`` with the body
    lowercased. Any other body is split in front of every capital letter and
    the pieces are space-joined after a leading ``<hashtag>`` tag.
    """
    body = text.group()[1:]
    if body.isupper():
        return "<hashtag> {} <allcaps>".format(body.lower())
    pieces = re.split(r"(?=[A-Z])", body, flags=FLAGS)
    return " ".join(["<hashtag>"] + pieces)

def allcaps(text):
    """Replacement callback: lowercase the matched text and append an ``<allcaps>`` tag."""
    lowered = text.group().lower()
    return lowered + " <allcaps> "

def repeat(text):
    """Replacement callback that collapses character runs.

    Every run of three or more identical characters in the matched text is
    reduced to a single character. If that changed anything, the collapsed
    text followed by a ``<repeat>`` tag is returned; otherwise the matched
    text is returned untouched.
    """
    matched = text.group()
    collapsed = re.sub(r'(.)\1{2,}', r'\1', matched)
    return matched if collapsed == matched else collapsed + ' <repeat> '

def pps_glove(text):
    """Rewrite a raw tweet into lowercase text annotated with angle-bracket tags.

    The substitutions run in a fixed order, each one on the result of the
    previous one: URLs -> ``<url>``, mentions -> ``<user>``, emoticons ->
    ``<smile>`` / ``<lolface>`` / ``<sadface>`` / ``<neutralface>``, ``<3`` ->
    ``<heart>``, numbers -> ``<number>``, hashtags (via ``hashtag``), repeated
    characters (via ``repeat`` and the ``<elong>`` rule), spacing around
    punctuation and parentheses, and finally all-caps words (via ``allcaps``).
    The tag set appears to follow the GloVe Twitter preprocessing conventions.

    Args:
        text: the tweet as a string.

    Returns:
        The transformed tweet, lowercased.
    """
    # Building blocks shared by the emoticon patterns
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    def re_sub(pattern, repl):
        # Reads the enclosing `text` at call time, so every call operates on
        # the output of the previous substitution.
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    # Surround forward slashes with spaces so "word/word" becomes two tokens
    text = re_sub(r"/"," / ")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\w+", hashtag)
    # Inside a word: from the first tripled character to the end of the word,
    # e.g. "yeeees" -> match "eeees" -> "es <repeat> "
    text = re_sub(r'(.)\1{2,}\w+', repeat)
    # Tripled character right after a space (covers runs such as " !!!!!!")
    text = re_sub(r' (.)\1{2,}', repeat)
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    text = re_sub(r"([a-zA-Z<>()])([?!.:;,])", r"\1 \2")
    text = re_sub(r"\(([a-zA-Z<>]+)\)", r"( \1 )")
    # Flag all-caps words (two or more capital letters delimited by whitespace
    # or the string boundaries). Lookarounds are used instead of matching the
    # surrounding spaces: consuming the spaces made the second of two adjacent
    # all-caps words, and any all-caps word at the start or end of the tweet,
    # impossible to match.
    text = re_sub(r"(?<!\S)[A-Z]{2,}(?!\S)", allcaps)

    return text.lower()

def contraction(text):
    """Return ``text`` after passing it through ``contractions.fix``.

    Presumably this expands English contractions (e.g. "you're" -> "you are");
    the exact behaviour is defined by the third-party ``contractions`` package.
    """
    return contractions.fix(text)

def remove_apostrophe(text):
    """Replace each apostrophe-like character (' ` ´) and each parenthesis with a space."""
    quote_or_paren = re.compile(r"['`´()]", FLAGS)
    return quote_or_paren.sub(r" ", text)

def seq_to_text(seq):
    """Join a sequence of string tokens into one space-separated string."""
    return ' '.join(seq)

##################################################################################################################################
#                                                     PREPROCESSING
#################################################################################################################################


print('start preprocess...')

# Cleaning pipeline, applied to every tweet in this order:
#   1. pps_glove         - tagging / normalisation
#   2. contraction       - must come after the smiley and flag tagging
#   3. remove_apostrophe - must come after contraction (you're, brother's, i'm ...);
#                          what is left of those characters becomes a space
#   4. whitespace split  - token lists, needed by the vocabulary counter
data['text'] = (
    data['text']
    .apply(pps_glove)
    .apply(contraction)
    .apply(remove_apostrophe)
    .apply(lambda x: x.split())
)

################################################################################################################################
#                                                 VOCABULARY and EMBEDDING MATRIX
################################################################################################################################

# Build the token -> vector lookup from the GloVe table: column 0 is used as
# the key, the remaining columns (one row array per token) as the value.
word = list(glove.iloc[:, 0].values)
coefs = list(glove.iloc[:, 1:].values)
embeddings_index = dict(zip(word, coefs))

# Count every token of the corpus
vocab = Counter(token for tokens in data['text'] for token in tokens)

# Tokens seen only once or twice in the corpus ...
vocab_low_freq = [token for token, count in vocab.most_common() if count < 3]

# ... are dropped from the vocabulary
for token in vocab_low_freq:
    vocab.pop(token)

vocab_size = len(vocab)

# Back to plain strings, which is what the tokenizer takes as input
print('applying seq to text...')
data['text'] = data['text'].apply(seq_to_text)

################################################################################################################################
#                                                       TOKENIZER AND SPLIT
################################################################################################################################

# Model inputs (preprocessed text) and targets
X1 = data.text.astype(str)
y1 = data.label

# Tokenizer / sequences
# - filters='': the Tokenizer's default filter list contains '<' and '>', which
#   would turn the tags produced by the preprocessing (<user>, <url>,
#   <hashtag>, ...) into plain words and lose their dedicated GloVe vectors.
#   The text is already cleaned and space-separated, so no filtering is needed.
# - num_words=vocab_size + 1: Keras keeps only the (num_words - 1) most
#   frequent words, so +1 is required to keep all vocab_size words.
t = Tokenizer(num_words=vocab_size + 1, filters='')
t.fit_on_texts(X1)
seq1 = t.texts_to_sequences(X1)

# Pad every sequence to the length of the longest one
seq_pad1 = sequence.pad_sequences(seq1)

# Embedding matrix with one row per tokenizer index (row 0 is never assigned a word)
# and glove_dim columns. Rows of words absent from the GloVe index stay all-zeros.
embedding_matrix = np.zeros((len(t.word_index) + 1, glove_dim))

for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Padded sequence length, used as the model's input length
max_l = seq_pad1.shape[1]

# Train / validation split (15% validation, fixed seed)
X_train1, X_val1, Y_train1, Y_val1 = train_test_split(seq_pad1, y1, test_size=0.15, random_state=2)

################################################################################################################################
#                                                       MODEL
################################################################################################################################

model2 = Sequential()

# Embedding layer initialised with the GloVe matrix
model2.add(Embedding(len(t.word_index)+1,
                     output_dim = glove_dim,
                     weights=[embedding_matrix],
                     input_length = max_l,
                     # frozen so that the pretrained weights are not updated
                     trainable=False))

# Recurrent layer
model2.add(LSTM(128,
                # standard tanh activation
                activation = 'tanh',
                # only the last output is needed unless another LSTM layer follows
                return_sequences=False,
                dropout=0.1))

# Fully connected layer
model2.add(Dense(128, activation='relu'))

# Dropout against overfitting
model2.add(Dropout(0.2))

# Output layer: sigmoid gives a probability
model2.add(Dense(1, activation='sigmoid'))

# Compile. compile() returns None, so its result is no longer stored in a
# misleading `history` variable.
model2.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy']) #, 'AUC'])

# Callbacks. Both monitor the validation metrics (validation data is passed to
# fit): watching the training loss/accuracy would keep training, and keep
# checkpointing, as long as the model continues to fit the training set.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model_glove', monitor='val_accuracy', mode='max', save_best_only=True, verbose=1)

################################################################################################################################
#                                                       TRAINING
################################################################################################################################

model2.fit(X_train1,
           Y_train1,
           epochs=epoch,
           batch_size = batch_size,
           validation_data = (X_val1, Y_val1),
           callbacks=[es, mc])

# Evaluate on the validation split. With a single compiled metric, evaluate()
# returns [loss, accuracy], so `accuracy` holds both values.
accuracy = model2.evaluate(X_val1, Y_val1) #, auc

# Log metrics in the run.
# The original list metric is kept under its historical name for compatibility;
# the scalar metrics below make loss and accuracy individually readable.
run.log_list('accuracy', accuracy)
run.log('val_loss', accuracy[0])
run.log('val_accuracy', accuracy[1])


# Save model (makedirs also creates the parent 'outputs' folder)
os.makedirs('outputs/model', exist_ok=True)
# serialize NN architecture to JSON
model_json = model2.to_json()
# save model JSON
with open('./outputs/model/model.json', 'w') as f:
    f.write(model_json)
# save model weights
model2.save_weights('./outputs/model/model.glove')
print("model saved in ./outputs/model folder")



run.complete()


Overwriting lstm/glove.py


In [9]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

In [None]:
# Fetch the environment registered under the name 'proj7-h'
registered_env = Environment.get(ws, 'proj7-h')

# Command-line arguments forwarded to glove.py
script_arguments = [
    '--input-data', 'train',
    '--glove-dim', 25,
    '--glove-weight', 'glove-25d',
    '--batch-size', 128,
    '--epoch', 50,
]

# Run configuration: script, arguments, environment and compute target
script_config = ScriptRunConfig(
    source_directory='lstm',
    script='glove.py',
    arguments=script_arguments,
    environment=registered_env,
    compute_target=cluster_name,
)

# Submit the run to the 'glove' experiment, show the widget and block until it finishes
experiment_name = 'glove'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [17]:
# Print every metric returned for the run as "name value"
metrics = run.get_metrics()
for key, value in metrics.items():
    print(key, value)

vocab_size 6963


In [18]:
# Print each file name returned by run.get_file_names(), one per line.
for file in run.get_file_names():
    print(file)

azureml-logs/55_azureml-execution-tvmps_fcddfe71e7c57134015c48375eca90957ca8d7b718cd24a281b8724cc9352405_p.txt
azureml-logs/65_job_prep-tvmps_fcddfe71e7c57134015c48375eca90957ca8d7b718cd24a281b8724cc9352405_p.txt
azureml-logs/70_driver_log.txt
azureml-logs/75_job_post-tvmps_fcddfe71e7c57134015c48375eca90957ca8d7b718cd24a281b8724cc9352405_p.txt
azureml-logs/process_info.json
azureml-logs/process_status.json
logs/azureml/111_azureml.log
logs/azureml/dataprep/backgroundProcess.log
logs/azureml/dataprep/backgroundProcess_Telemetry.log
logs/azureml/job_prep_azureml.log
logs/azureml/job_release_azureml.log
outputs/sample-pre.csv
