In [None]:
import datetime
from pathlib import Path

import pandas as pd
import requests
from joblib import load, dump
from tqdm import tqdm

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder

In [None]:
# (file name, target directory) pairs to fetch.
files = [('mental_health_dataset.parquet', './data')]

print("Download files:")
for file, path in files:
    # NOTE(review): this URL looks like the dataset's Kaggle landing page rather
    # than a direct file link, so the saved bytes may be HTML instead of a
    # parquet file -- confirm the correct download endpoint.
    url = "https://www.kaggle.com/datasets/bhadramohit/mental-health-dataset"

    # Create the destination folder up front so open() cannot fail on a fresh checkout.
    Path(path).mkdir(parents=True, exist_ok=True)
    save_path = f"{path}/{file}"

    # The context manager releases the connection even if the download fails midway.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly on 4xx/5xx instead of writing an error page to disk.
        resp.raise_for_status()

        # Content-Length is optional (e.g. chunked responses); without it the
        # progress bar simply runs without a known total.
        total = int(resp.headers.get("Content-Length", 0)) or None

        with open(save_path, "wb") as handle, tqdm(desc=f"{file}",
                                                   postfix=f"save to {save_path}",
                                                   total=total,
                                                   unit="B",
                                                   unit_scale=True) as progress:
            # Read in 8 KiB chunks; the default chunk size of 1 byte is extremely slow.
            for chunk in resp.iter_content(chunk_size=8192):
                handle.write(chunk)
                progress.update(len(chunk))

In [None]:
# Data cleaning function
def clean_data(df):
    """Clean the raw mental-health frame so it can be passed to a model.

    The frame is modified in place and is also returned, so callers may keep
    using either the name they passed in or the return value.

    Steps:
      * missing 'Severity', 'Consultation_History' and 'Stress_Level' values
        are replaced by the literal category 'Unknown';
      * every categorical column is cast to ``str`` and label-encoded into
        integer codes;
      * 'Age' is coerced to a numeric dtype (unparseable entries become NaN);
      * missing values in the numeric columns, including the NaN produced by
        the 'Age' coercion, are replaced by the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical and numeric columns named in the body.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.
    """
    # Categorical columns whose gaps get an explicit 'Unknown' category.
    for col in ('Severity', 'Consultation_History', 'Stress_Level'):
        df[col] = df[col].fillna('Unknown')

    # Convert categorical columns to string types
    categorical_columns = ['Gender', 'Occupation', 'Country', 'Mental_Health_Condition', 'Severity', 'Consultation_History', 'Stress_Level']
    df[categorical_columns] = df[categorical_columns].astype(str)

    # Convert categorical columns to integer codes. A fresh encoder is fitted
    # per column and then discarded, so the codes cannot be mapped back to
    # labels or re-applied to new data from here.
    for col in categorical_columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    # Coerce 'Age' first so that any NaN it introduces is imputed below
    # together with the other numeric columns.
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

    # Handle missing numerical values by filling with the median
    for col in ('Age', 'Sleep_Hours', 'Work_Hours', 'Physical_Activity_Hours'):
        df[col] = df[col].fillna(df[col].median())

    return df

In [None]:
# Read the raw CSV, then run it through clean_data.
# clean_data works on the frame it receives and returns that same frame, so
# `df` and `df_cleaned` refer to one cleaned object after this cell.
# NOTE(review): the download cell writes to './data/mental_health_dataset.parquet',
# while this cell reads a CSV from '../data' -- confirm which location is intended.
df = pd.read_csv(filepath_or_buffer='../data/mental_health_dataset.csv')
df_cleaned = clean_data(df)

In [None]:
# Select features and target variable.
# The target column must not appear among the model inputs: listing
# 'Mental_Health_Condition' as a feature would let the model read the answer
# directly (target leakage) and make every downstream metric meaningless.
target = 'Mental_Health_Condition'
cat_features = ['Gender', 'Occupation', 'Country', 'Severity', 'Consultation_History', 'Stress_Level']
num_features = ['Age', 'Sleep_Hours', 'Work_Hours', 'Physical_Activity_Hours']

In [None]:
df.shape

In [None]:
train_data = df[:5000]
val_data = df[5000:]

In [None]:
model = LinearRegression()

In [None]:
model.fit(train_data[num_features + cat_features], train_data[target])

In [None]:
val_preds = model.predict(val_data[num_features + cat_features])
val_data['prediction'] = val_preds

In [None]:
print(mean_absolute_error(train_data.duration_min, train_data.prediction))
print(mean_absolute_error(val_data.duration_min, val_data.prediction))

In [None]:
with open('models/mental_health_model2.bin', 'wb') as f_out:
    dump(model, f_out)

In [None]:
val_data.to_parquet('data/reference.parquet')

In [None]:
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features
)

In [None]:
report = Report(metrics=[
    ColumnDriftMetric(column_name='prediction'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric()
]
)

In [None]:
report.run(reference_data=train_data, current_data=val_data, column_mapping=column_mapping)

In [None]:
report.show(mode='inline')

In [None]:
result = report.as_dict()

In [None]:
result

In [None]:
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

In [None]:
# Evidently workspace handle used below to create the project and add reports.
# NOTE(review): presumably backed by a local folder named "workspace" -- confirm against the Evidently docs.
ws = Workspace("workspace")

In [None]:
# Create the monitoring project in the workspace, set its description, and
# call save() afterwards (presumably this persists the change -- confirm).
project = ws.create_project("Mental Health Project")
project.description = "mental health prediction project"
project.save()

In [None]:
regular_report = Report(
    metrics=[
        DataQualityPreset()
    ],
    timestamp=datetime.datetime(2022,1,28)
)

regular_report.run(reference_data=None,
                  current_data=val_data.loc[val_data.age.between('20', '40', inclusive="left")],
                  column_mapping=column_mapping)

regular_report

In [None]:
ws.add_report(project.id, regular_report)

In [None]:
# Configure the project dashboard: a title panel plus two plots fed by the
# DatasetSummaryMetric values of every report added to the project.
# NOTE: each run of this cell appends three more panels to the dashboard.

# Title panel. The previous text was left over from an NYC-taxi example.
project.dashboard.add_panel(
    DashboardPanelCounter(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        agg=CounterAgg.NONE,
        title="Mental health data dashboard"
    )
)

# Bar chart: number of rows in the current data of each report.
project.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Inference Count",
        values=[
            PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path="current.number_of_rows",
                legend="count"
            ),
        ],
        plot_type=PlotType.BAR,
        size=WidgetSize.HALF,
    ),
)

# Line chart: number of missing values in the current data of each report.
project.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Number of Missing Values",
        values=[
            PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path="current.number_of_missing_values",
                legend="count"
            ),
        ],
        plot_type=PlotType.LINE,
        size=WidgetSize.HALF,
    ),
)

project.save()

In [None]:
regular_report = Report(
    metrics=[
        DataQualityPreset()
    ],
    timestamp=datetime.datetime(2022,1,29)
)

regular_report.run(reference_data=None,
                  current_data=val_data.loc[val_data.age.between(20, 40, inclusive="left")],
                  column_mapping=column_mapping)

regular_report

In [None]:
ws.add_report(project.id, regular_report)