# Data science in Microsoft Fabric

In [1]:
# Azure storage access info for the open "diabetes" dataset
blob_account_name = "azureopendatastorage"
blob_container_name = "mlsamples"
blob_relative_path = "diabetes"
blob_sas_token = r""  # Blank because the container allows anonymous access

# Build the wasbs:// URL of the dataset and register the (empty) SAS token
# under the Spark config key for the same container/account pair.
wasbs_path = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{blob_relative_path}"
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)
print("Remote blob path: " + wasbs_path)

# Set up the Spark parquet read; per the original note, the data itself is
# not loaded at this point.
df = spark.read.parquet(wasbs_path)

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 3, Finished, Available, Finished)

Remote blob path: wasbs://mlsamples@azureopendatastorage.blob.core.windows.net/diabetes


In [2]:
# Render the Spark DataFrame read above using the notebook's display() helper
display(df)

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5a20ccb8-1d80-4e6c-9383-e7836e0a5b81)

In [3]:
# Convert the Spark DataFrame to a pandas DataFrame for the steps below.
# The name `df` is rebound here, so the conversion is guarded: once `df` is
# already a pandas DataFrame (which has no `toPandas` method), re-running this
# cell is a no-op instead of raising AttributeError.
if hasattr(df, "toPandas"):
    df = df.toPandas()
df.head()

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 5, Finished, Available, Finished)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [5]:
# Code generated by Data Wrangler for pandas DataFrame

def clean_data(df, threshold=211.5):
    """Add a binary 'Risk' column derived from the 'Y' column.

    Args:
        df: pandas DataFrame with a numeric 'Y' column. It is modified in
            place, so pass a copy if the original must stay unchanged.
        threshold: Cut-off applied to 'Y'. Rows with 'Y' strictly greater
            than this value get Risk = 1, all others get Risk = 0. The
            default of 211.5 is the value used by the generated code (it
            matches the 75% row for 'Y' in this notebook's describe() output).

    Returns:
        The same DataFrame object that was passed in, with 'Risk' added.
    """
    # Created column 'Risk' from formula
    df['Risk'] = (df['Y'] > threshold).astype(int)
    return df

# clean_data() adds the 'Risk' column to the frame it is given, so pass a copy
# to leave the original `df` untouched.
df_clean = clean_data(df.copy())
display(df_clean)

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 29, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 750a5528-8172-4d93-88ab-b4a73b70f782)

In [6]:
# Summary statistics of the cleaned data, including the derived 'Risk' column
df_clean.describe()

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 30, Finished, Available, Finished)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y,Risk
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,48.5181,1.468326,26.375792,94.647014,189.140271,115.43914,49.788462,4.070249,4.641411,91.260181,152.133484,0.251131
std,13.109028,0.499561,4.418122,13.831283,34.608052,30.413081,12.934202,1.29045,0.522391,11.496335,77.093005,0.434155
min,19.0,1.0,18.0,62.0,97.0,41.6,22.0,2.0,3.2581,58.0,25.0,0.0
25%,38.25,1.0,23.2,84.0,164.25,96.05,40.25,3.0,4.2767,83.25,87.0,0.0
50%,50.0,1.0,25.7,93.0,186.0,113.0,48.0,4.0,4.62005,91.0,140.5,0.0
75%,59.0,2.0,29.275,105.0,209.75,134.5,57.75,5.0,4.9972,98.0,211.5,0.75
max,79.0,2.0,42.2,133.0,301.0,242.4,99.0,9.09,6.107,124.0,346.0,1.0


### Train a regression model

In [7]:
from sklearn.model_selection import train_test_split

# Features are the ten predictor columns; the regression label is 'Y'
feature_columns = ['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
X = df_clean[feature_columns].values
y = df_clean['Y'].values

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 31, Finished, Available, Finished)

In [8]:
import mlflow
# Make "diabetes-regression" the active MLflow experiment; the returned
# Experiment object is the cell's output.
experiment_name = "diabetes-regression"
mlflow.set_experiment(experiment_name)

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 32, Finished, Available, Finished)

2025/08/25 04:25:37 INFO mlflow.tracking.fluent: Experiment with name 'diabetes-regression' does not exist. Creating a new experiment.


<Experiment: artifact_location='sds://onelakeaustraliaeast.pbidedicated.windows.net/b5231dbc-1ecd-4d45-b832-08b3e01d8c1f/71de543a-9e78-42f3-8154-58f8f7ee7440', creation_time=1756095942285, experiment_id='71de543a-9e78-42f3-8154-58f8f7ee7440', last_update_time=1756095942285, lifecycle_stage='active', name='diabetes-regression', tags={}>

In [12]:
from sklearn.linear_model import LinearRegression

# NOTE(review): this cell uses whatever X_train / y_train are currently bound.
# The classification section below rebinds the same names (with 'Risk' as the
# label), so run this cell right after the regression split, not after it.
with mlflow.start_run():
    # Enable MLflow autologging before fitting, inside the active run
    mlflow.autolog()

    # fit() returns the estimator itself, so `model` is the fitted regressor
    model = LinearRegression().fit(X_train, y_train)

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 36, Finished, Available, Finished)

### Train a classification model

In [9]:
from sklearn.model_selection import train_test_split

# Same ten predictor columns as features; the label is now the binary 'Risk'
feature_columns = ['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
X = df_clean[feature_columns].values
y = df_clean['Risk'].values

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 33, Finished, Available, Finished)

In [10]:
import mlflow
# Make "diabetes-classification" the active MLflow experiment; the returned
# Experiment object is the cell's output.
experiment_name = "diabetes-classification"
mlflow.set_experiment(experiment_name)

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 34, Finished, Available, Finished)

2025/08/25 04:26:41 INFO mlflow.tracking.fluent: Experiment with name 'diabetes-classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='sds://onelakeaustraliaeast.pbidedicated.windows.net/b5231dbc-1ecd-4d45-b832-08b3e01d8c1f/d4ae6003-9e4a-445c-b478-c81cfbd1c0ee', creation_time=1756096005808, experiment_id='d4ae6003-9e4a-445c-b478-c81cfbd1c0ee', last_update_time=1756096005808, lifecycle_stage='active', name='diabetes-classification', tags={}>

In [11]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): trains on the X_train / y_train currently bound; these are
# expected to come from the classification split ('Risk' label) cell.
with mlflow.start_run():
    # Enable scikit-learn autologging before fitting, inside the active run
    mlflow.sklearn.autolog()

    # C=1/0.1 evaluates to 10.0; construct the estimator, then fit it
    model = LogisticRegression(C=1/0.1, solver="liblinear")
    model.fit(X_train, y_train)

StatementMeta(, 63446567-947e-42f1-b388-e261690dd809, 35, Finished, Available, Finished)

