In [None]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Credential object handed to the workspace client below.
credential = DefaultAzureCredential()

# Identifiers of the target workspace. The angle-bracket values are
# placeholders that must be replaced before the notebook is run.
workspace_scope = {
    "subscription_id": "<subscription_id>",
    "resource_group_name": "<resource_group_name>",
    "workspace_name": "<workspace_name>",
}

# Handle to the workspace; later cells use it to look up and register data assets.
ml_client = MLClient(credential=credential, **workspace_scope)

In [None]:

# Location of the raw CSV that backs the data asset.
my_path = "./data/spam.csv"

# Version label for the raw, unprocessed dataset (reused when the asset is read back).
v1 = "initial"

my_data = Data(
    name="spam-email-data",
    version=v1,
    description="Email spam data from UCI Machine Learning Repository.",
    path=my_path,
    type=AssetTypes.URI_FILE,
)

# Register the asset only if this name/version cannot be fetched from the workspace.
try:
    data_asset = ml_client.data.get(name=my_data.name, version=my_data.version)
    print(
        f"Data asset already exists. Name: {my_data.name}, version: {my_data.version}"
    )
except Exception:
    # `except Exception` (not a bare `except:`) so that KeyboardInterrupt and
    # SystemExit still propagate instead of silently triggering a registration.
    # NOTE(review): any lookup failure, not only "asset not found", still falls
    # through to creation -- consider narrowing to the SDK's not-found error.
    # Keep the returned asset so `data_asset` is bound on both paths.
    data_asset = ml_client.data.create_or_update(my_data)
    print(f"Data asset created. Name: {my_data.name}, version: {my_data.version}")

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import re

In [None]:

# Fetch the raw ("initial") version of the registered asset and report where it lives.
data_asset = ml_client.data.get(name="spam-email-data", version=v1)
asset_uri = data_asset.path
print(f"Data asset URI: {asset_uri}")

# Load the CSV behind the asset into the working DataFrame used by later cells.
df = pd.read_csv(asset_uri)

# Preview the first rows as the cell output.
df.head()

In [None]:

# Take labels and bar heights from the SAME Series. Previously the labels came
# from `unique()` (first-appearance order) and the heights from `value_counts()`
# (sorted by descending count), so a bar could be shown under the wrong category;
# the two also differ in length when the column contains NaN.
category_counts = df['Category'].value_counts()

plt.bar(category_counts.index.astype(str), category_counts.values)
plt.xlabel('Category')
plt.ylabel('Number of messages')
plt.title('Messages per category')
plt.show()

In [None]:

def _normalise_message(text):
    """Return `text` without punctuation or ASCII digits, lower-cased and stripped.

    The steps run in this order: remove every character that is neither a word
    character nor whitespace, remove the digits 0-9, lower-case, then trim
    leading and trailing whitespace.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    without_digits = re.sub(r'[0-9]', '', without_punctuation)
    return without_digits.lower().strip()


# Clean every message in a single pass and preview the result.
df['Message'] = df['Message'].apply(_normalise_message)
df.head()

In [None]:
# Install NLTK via the %pip magic so it targets the running kernel's environment.
# No version is pinned here, so the newest available release is installed.
%pip install nltk

In [None]:
import nltk

# Resources needed by the following cells: the stop-word lists and the Punkt
# tokenizer data used by `word_tokenize`.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK is installed without a version pin, and recent releases (3.9 and later)
# make `word_tokenize` look up the `punkt_tab` resource instead of `punkt`.
# Fetching both keeps tokenization working on old and new releases.
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# A set gives constant-time membership tests when filtering tokens later.
stop_words = set(stopwords.words('english'))

In [None]:
# Pass 1: tokenize each message and drop English stop words. Messages were
# lower-cased in an earlier cell, before being compared with `stop_words` here.
df['Message'] = df['Message'].apply(
    lambda x: ' '.join(word for word in word_tokenize(x) if word not in stop_words)
)

# Pass 2: re-tokenize the filtered text and reduce every token to its Porter stem.
# The stemmer is built once and reused; the previous code constructed a new
# PorterStemmer for every single word.
stemmer = PorterStemmer()
df['Message'] = df['Message'].apply(
    lambda x: ' '.join(stemmer.stem(word) for word in word_tokenize(x))
)
df.head()

In [None]:
# Install the wordcloud package via the %pip magic so it targets the running
# kernel's environment. No version is pinned here.
%pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Join all processed messages into one space-separated corpus string.
corpus = ' '.join(df['Message'])

# 800x800 cloud on a white background, limited to 150 words.
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    max_words=150,
).generate(corpus)

# Render the cloud in an 8x8 inch figure.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)

In [None]:
# Drop every row whose 'Message' is exactly the empty string, selecting the
# affected index labels with a boolean mask.
df = df.drop(index=df.index[df['Message'] == ''])

In [None]:
# Write the processed frame next to the raw data; the index is not written as a column.
df.to_csv(path_or_buf='data/spam_cleaned.csv', index=False)

In [None]:

# Location of the processed CSV written by the previous cell.
my_path = "./data/spam_cleaned.csv"

# Version label for the processed dataset; the asset name matches the raw
# version so both are versions of the same asset.
v2 = "cleaned"

my_data = Data(
    name="spam-email-data",
    version=v2,
    description="Email spam data from UCI Machine Learning Repository.",
    path=my_path,
    type=AssetTypes.URI_FILE,
)

# Register the asset only if this name/version cannot be fetched from the workspace.
try:
    data_asset = ml_client.data.get(name=my_data.name, version=my_data.version)
    print(
        f"Data asset already exists. Name: {my_data.name}, version: {my_data.version}"
    )
except Exception:
    # `except Exception` (not a bare `except:`) so that KeyboardInterrupt and
    # SystemExit still propagate instead of silently triggering a registration.
    # NOTE(review): any lookup failure, not only "asset not found", still falls
    # through to creation -- consider narrowing to the SDK's not-found error.
    # Keep the returned asset: otherwise `data_asset` would still refer to the
    # previously loaded "initial" version after a fresh registration.
    data_asset = ml_client.data.create_or_update(my_data)
    print(f"Data asset created. Name: {my_data.name}, version: {my_data.version}")
