<a href="https://colab.research.google.com/github/Aeim/MadeWithML/blob/main/test_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Testing ML

## Data

In [None]:
# Install the pinned great-expectations release used by the cells below.
!pip install great-expectations==0.15.15

In [2]:
import great_expectations as ge
import json
import pandas as pd
from urllib.request import urlopen

In [3]:
# Fetch the projects and their tags, join them on the shared `id` column,
# and wrap the result so expectation methods can be called on it directly.
projects_url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/projects.csv"
tags_url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/tags.csv"
projects = pd.read_csv(projects_url)
tags = pd.read_csv(tags_url)
labeled_projects = projects.merge(tags, on="id")
df = ge.dataset.PandasDataset(labeled_projects)
print(f"{len(df)} projects")
df.head(5)

955 projects


Unnamed: 0,id,created_on,title,description,tag
0,6,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...,computer-vision
1,7,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...,computer-vision
2,9,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla...",graph-learning
3,15,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...,reinforcement-learning
4,19,2020-03-03 13:54:31,Diffusion to Vector,Reference implementation of Diffusion2Vec (Com...,graph-learning


### Expectations

In [None]:
# Schema check: the table must expose these columns, in this order.
expected_columns = ["id", "created_on", "title", "description", "tag"]
df.expect_table_columns_to_match_ordered_list(column_list=expected_columns)

In [None]:
# Data-leak guard: no two rows may share the same (title, description) pair.
df.expect_compound_columns_to_be_unique(
    column_list=["title", "description"],
)

In [None]:
# Completeness check: the `tag` column must not contain nulls.
df.expect_column_values_to_not_be_null(
    column="tag",
)

In [None]:
# Uniqueness check: no `id` value may appear more than once.
df.expect_column_values_to_be_unique(
    column="id",
)

In [None]:
# Type check: every `title` value must be a string.
df.expect_column_values_to_be_of_type(
    column="title",
    type_="str",
)

In [None]:
# Allowed values: for a categorical column, list the permitted values
# (a continuous column would use a range instead).
# NOTE: this rebinds `tags`, which held the tags DataFrame in the loading cell.
tags = [
    "computer-vision",
    "graph-learning",
    "reinforcement-learning",
    "natural-language-processing",
    "mlops",
    "time-series",
]
df.expect_column_values_to_be_in_set(column="tag", value_set=tags)

### Organization

In [None]:
# Table-level expectations

# schema: expected columns in the expected order
ordered_columns = ["id", "created_on", "title", "description", "tag"]
df.expect_table_columns_to_match_ordered_list(column_list=ordered_columns)

# data leak: each (title, description) pair may appear only once
df.expect_compound_columns_to_be_unique(
    column_list=["title", "description"],
)

In [None]:
# Column-level expectations

# id: must be unique
df.expect_column_values_to_be_unique(column="id")

# created_on: always present and formatted as "YYYY-MM-DD HH:MM:SS"
df.expect_column_values_to_not_be_null(column="created_on")
df.expect_column_values_to_match_strftime_format(
    column="created_on",
    strftime_format="%Y-%m-%d %H:%M:%S",
)

# title, description, tag: each must be present and hold strings.
# The expectations are registered in the same order as writing them out
# one by one (not-null first, then type, column by column).
for text_column in ("title", "description", "tag"):
    df.expect_column_values_to_not_be_null(column=text_column)
    type_check = df.expect_column_values_to_be_of_type(
        column=text_column,
        type_="str",
    )

# Echo the final result (the `tag` type check) as the cell output.
type_check

In [None]:
# Gather the expectations registered on `df` into a suite (without discarding
# failed ones), then validate the data and print only the failures.
expectation_suite = df.get_expectation_suite(
    discard_failed_expectations=False,
)
validation_report = df.validate(
    expectation_suite=expectation_suite,
    only_return_failures=True,
)
print(validation_report)

### Projects
### Checkpoints
### Documentation
### Production

## Models 🤖

### Training

Check the shape of the model's output:

`assert model(inputs).shape == torch.Size([len(inputs), num_classes])`

Check that the loss decreases from one epoch to the next:

`assert epoch_loss < prev_epoch_loss`

Check that the model can overfit a single batch:

```python
accuracy = train(model, inputs=batches[0])
assert accuracy == pytest.approx(0.95, abs=0.05) # 0.95 +- 0.05
```
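After a full training run, check that the learning rate stayed at or above its minimum and that artifacts were produced:

```python
train(model)
assert learning_rate >= min_learning_rate
assert artifacts
```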
Check that training succeeds on both CPU and GPU:

```python
assert train(model, device=torch.device("cpu"))
assert train(model, device=torch.device("cuda"))
```






### Behavioral testing

```python
# INVariance via verb injection (changes should not affect outputs)
tokens = ["revolutionized", "disrupted"]
texts = [f"Transformers applied to NLP have {token} the ML field." for token in tokens]
predict.predict(texts=texts, artifacts=artifacts)

# DIRectional expectations (changes with known outputs)
tokens = ["text classification", "image classification"]
texts = [f"ML applied to {token}." for token in tokens]
predict.predict(texts=texts, artifacts=artifacts)

# Minimum Functionality Tests (simple input/output pairs)
tokens = ["natural language processing", "mlops"]
texts = [f"{token} is the next big wave in machine learning." for token in tokens]
predict.predict(texts=texts, artifacts=artifacts)
```



### Inference