# Demo

In [None]:
!pip install tai-chi-engine

## Probable other dependencies
transformers==4.12.3

In [1]:
from forgebox.imports import *
from tai_chi_engine import TaiChiEngine

## Demo tasks

> Run all the cells above in one go; the demo starts here.

In [2]:
def df_creator_image_folder(path: Path, random_state=None) -> pd.DataFrame:
    """
    Create a dataframe listing every image found under a folder.

    The folder is searched recursively. An entry counts as an image when its
    extension is jpg, jpeg or png, compared case-insensitively (so ``.JPG``
    and ``.Jpeg`` are picked up as well).

    Args:
        path: root folder to search (anything ``Path`` accepts).
        random_state: optional seed forwarded to ``DataFrame.sample``;
            leave as ``None`` to get a different shuffle on every call.

    Returns:
        A dataframe with a single ``path`` column, one row per image,
        in shuffled order with a fresh 0..n-1 index.
    """
    path = Path(path)
    suffixes = {".jpg", ".jpeg", ".png"}
    # One recursive walk with a case-insensitive suffix test: globbing a
    # lower-case and an upper-case pattern separately lists every file twice
    # on case-insensitive filesystems and misses mixed-case extensions.
    # Sorting gives a stable starting order, so a seed reproduces the shuffle.
    files = sorted(
        candidate
        for candidate in path.rglob("*")
        if candidate.suffix.lower() in suffixes
    )
    return (
        pd.DataFrame({"path": files})
        .sample(frac=1., random_state=random_state)
        .reset_index(drop=True)
    )

### Choose dataset

In [3]:
# Root folder that holds every demo dataset.
# (Alternative bear location: HOME/"Downloads"/"bear_dataset")
DATA = Path("/GCI/data")
# One sub-folder of DATA per demo dataset.
BEAR_DATASET = DATA.joinpath("bear_dataset")
ROTTEN_TOMATOES = DATA.joinpath("rttmt")
NETFLIX = DATA.joinpath("nf")

Choose one of the following datasets and run only its cell (each one defines `base_df`):

#### Netflix 📺

In [4]:
# Read the Netflix titles table and preview its first rows.
netflix_csv = NETFLIX.joinpath("netflix_titles.csv")
base_df = pd.read_csv(netflix_csv)
base_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


#### The bear 🐻

In [4]:
# List every image under the bear dataset folder; rows come back shuffled.
base_df = df_creator_image_folder(path=BEAR_DATASET)
base_df

Unnamed: 0,path
0,/GCI/data/bear_dataset/grizzly/00000137.jpg
1,/GCI/data/bear_dataset/grizzly/00000041.jpg
2,/GCI/data/bear_dataset/grizzly/00000099.jpg
3,/GCI/data/bear_dataset/black/00000058.jpg
4,/GCI/data/bear_dataset/grizzly/00000026.jpeg
...,...
517,/GCI/data/bear_dataset/teddys/00000085.jpg
518,/GCI/data/bear_dataset/teddys/00000009.jpg
519,/GCI/data/bear_dataset/black/00000096.jpg
520,/GCI/data/bear_dataset/grizzly/00000045.jpg


#### The rotten tomatoes 🍅 🎬

In [4]:
# The Rotten Tomatoes dataset: only the first 200,000 lines are read.

def _fraction_to_float(score: str) -> float:
    """
    Turn a 'numerator/denominator' score such as '3.5/5' into a float.

    Returns NaN when either side is not a number or the denominator is 0.
    The text is parsed, never evaluated: calling eval() on file content
    would execute arbitrary code and rejects inputs such as '07/10'.
    """
    numerator, _, denominator = score.partition("/")
    try:
        return float(numerator) / float(denominator)
    except (ValueError, ZeroDivisionError):
        return float("nan")


base_df = pd.read_csv(ROTTEN_TOMATOES/'critic_reviews.csv', nrows=200000)

# Drop rows that miss any of the fields used below.
base_df = base_df.dropna(
    subset=['review_score', 'review_content', 'critic_name']
).reset_index(drop=True)

# Keep only the scores written as a fraction (they contain a "/").
is_fraction = base_df['review_score'].astype(str).str.contains("/", regex=False)
base_df = base_df[is_fraction].reset_index(drop=True)

# Convert "a/b" into the float a / b; rows that cannot be parsed become NaN
# and are removed.
base_df['review_score'] = base_df['review_score'].astype(str).apply(_fraction_to_float)
base_df = base_df[base_df['review_score'].notna()].reset_index(drop=True)

base_df

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,0.700,2010-02-09,Whether audiences will get behind The Lightnin...
1,m/0814255,Nick Schager,False,Slant Magazine,Rotten,0.250,2010-02-10,Harry Potter knockoffs don't come more transpa...
2,m/0814255,Bill Goodykoontz,True,Arizona Republic,Fresh,0.700,2010-02-10,"Percy Jackson isn't a great movie, but it's a ..."
3,m/0814255,Jim Schembri,True,The Age (Australia),Fresh,0.600,2010-02-10,"Crammed with dragons, set-destroying fights an..."
4,m/0814255,Mark Adams,False,Daily Mirror (UK),Fresh,0.800,2010-02-10,"This action-packed fantasy adventure, based on..."
...,...,...,...,...,...,...,...,...
108237,m/bottle_shock,Phil Villarreal,False,Arizona Daily Star,Rotten,0.500,2008-08-29,"It might have worked better as a documentary, ..."
108238,m/bottle_shock,Todd Gilchrist,False,IGN Movies,Rotten,0.400,2008-08-29,Bottle Shock feels more like an excuse to exer...
108239,m/bottle_shock,Austin Kennedy,False,Sin Magazine,Rotten,0.625,2008-09-02,"I was slightly involved towards the end, but t..."
108240,m/bottle_shock,Sean P. Means,False,Salt Lake Tribune,Rotten,0.500,2008-09-05,"Flat, musty and with a hint of flopsweat."


### Start of the pipeline

Initialize the `phase` to track the configuration.

In [5]:
# Project folder for this run; it is passed to TaiChiEngine here and read
# back later with PhaseConfig.load(PROJECT).
PROJECT = Path("./project")
# Other project folders (uncomment one to switch):
# PROJECT = Path("./project/image_regression")
# PROJECT = Path("./project/rotten1")
# PROJECT = Path("./project/rotten_text")
# PROJECT = Path("./project/netflix")
# PROJECT = Path("./project0")
# PROJECT = Path("./playground")

# Build the engine on the dataframe selected in the dataset section.
engine = TaiChiEngine(base_df, project = PROJECT)



In [6]:
# Calling the engine displays its widget UI (this cell's VBox output starts
# with a "1:Enrich" button).
engine()

VBox(children=(HBox(children=(Button(button_style='danger', description='1:Enrich', icon='cube', style=ButtonS…

Output()

In [7]:
from tai_chi_tuna.config import PhaseConfig
from tai_chi_tuna.flow.to_enrich import set_enrich, execute_enrich
from tai_chi_tuna.flow.to_quantify import (
    execute_quantify, TaiChiDataset, choose_xy, 
    save_qdict, load_qdict
    )
from tai_chi_tuna.flow.to_model import TaiChiDataModule, assemble_model
from tai_chi_tuna.flow.to_train import (
    make_slug_name, set_trainer, run_training)

In [8]:
from tai_chi_engine import TaiChiEngine

In [9]:
# Load the configuration stored for the PROJECT folder
# (presumably written through the engine UI above; TODO confirm).
phase = PhaseConfig.load(PROJECT)



In [10]:
# Run the enrich step with the engine's enrichment registry, then wrap the
# result in a TaiChiDataset.
# NOTE(review): base_df is overwritten, so re-running this cell feeds the
# already-enriched frame back into execute_enrich.
base_df = execute_enrich(base_df, phase, enrichments=TaiChiEngine.enrichments_map)
ds = TaiChiDataset(base_df)

  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Build the quantify objects from the dataframe and the phase config, using
# the engine's quantify registry.
qdict = execute_quantify(df=base_df, phase=phase, quantify_map=TaiChiEngine.quantify_map)
# Save the quantify objects under the project; the return value is not
# needed, hence the throwaway `_`.
_ = save_qdict(phase.project, qdict)

0it [00:00, ?it/s]

In [12]:
# Data module built from the dataset and the quantify objects.
datamodule = TaiChiDataModule(ds, qdict)
# The "batch_level" section of the phase config is expanded into keyword
# arguments for configure().
datamodule.configure(**phase['batch_level'])

In [13]:
# The engine's entry and exit module collections, handed to assemble_model
# under the keys "all_entry" and "all_exit".
module_zoo = {"all_entry": TaiChiEngine.all_entry, "all_exit": TaiChiEngine.all_exit}
final_model = assemble_model(phase, qdict, module_zoo)

In [14]:
# Store the slug generated from the phase config under 'task_slug', then
# save the config.
phase['task_slug'] = make_slug_name(phase)
phase.save()

In [16]:
# run_training(...) returns a callable, which is invoked right away with an
# empty dict. The cell output ends with a pytorch_lightning Trainer object.
run_training(phase, final_model, datamodule)(dict())

HBox(children=(HTML(value="<div class='alert alert-info' role='alert'>\n        <strong>Notice</strong>  Creat…

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores


HBox(children=(HTML(value="<div class='alert alert-success' role='alert'>\n        <strong>Alert!</strong>  St…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()


  | Name       | Type        | Params
-------------------------------------------
0 | entry_dict | EntryDict   | 11.2 M
1 | exit_part  | CategoryTop | 1.5 K 
-------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.712    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'
  f'The dataloader, {name}, does not have many workers which may be a bottleneck.'
  f'The dataloader, {name}, does not have many workers which may be a bottleneck.'


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 14: val_loss reached 0.21179 (best 0.21179), saving model to "/nvme/GCI/lib/tai-chi/nbs/project/checkpoints/epoch=0-val_loss=0.21.ckpt.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 29: val_loss reached 0.19265 (best 0.19265), saving model to "/nvme/GCI/lib/tai-chi/nbs/project/checkpoints/epoch=1-val_loss=0.19.ckpt.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 44: val_loss reached 0.15943 (best 0.15943), saving model to "/nvme/GCI/lib/tai-chi/nbs/project/checkpoints/epoch=2-val_loss=0.16.ckpt.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 59: val_loss reached 0.11818 (best 0.11818), saving model to "/nvme/GCI/lib/tai-chi/nbs/project/checkpoints/epoch=3-val_loss=0.12.ckpt.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 74: val_loss reached 0.10662 (best 0.10662), saving model to "/nvme/GCI/lib/tai-chi/nbs/project/checkpoints/epoch=4-val_loss=0.11.ckpt.ckpt" as top 1


<pytorch_lightning.trainer.trainer.Trainer at 0x7fcd13d77710>

In [1]:
from tai_chi_engine import TaiChiTrained
from pathlib import Path

In [2]:
# Same project folder as in the training section. It is defined again because
# the execution counts restart at In [1] here (presumably a fresh session).
PROJECT = Path("./project")
trained = TaiChiTrained(PROJECT)



In [3]:
from PIL import Image
# Open one grizzly sample, force RGB and resize to 224x224 (the same convert
# and size values that appear under "enrich" in trained.phase).
sample_path = "/GCI/data/bear_dataset/grizzly/00000099.jpg"
rgb_image = Image.open(sample_path).convert('RGB')
img = rgb_image.resize((224, 224))

In [5]:
# The input key "image" matches the "src" of the x model listed in
# trained.phase. The output is a 1x3 tensor (presumably one score per
# bear class; TODO confirm).
trained.predict({"image":img})

tensor([[ 5.8866, -5.5020, -3.5725]])

In [9]:
# Display the PhaseConfig held by the trained object.
trained.phase

PhaseConfig:{
  "enrich": [
    {
      "src": "path",
      "dst": "image",
      "kwargs": {
        "convert": "RGB",
        "size": 224
      },
      "enrich": "EnrichImage"
    },
    {
      "src": "path",
      "dst": "label",
      "kwargs": {},
      "enrich": "ParentAsLabel"
    }
  ],
  "quantify": [
    {
      "src": "image",
      "x": true,
      "kwargs": {
        "mean_": "imagenet",
        "std_": "imagenet"
      },
      "quantify": "QuantifyImage"
    },
    {
      "src": "label",
      "x": false,
      "kwargs": {
        "min_frequency": 1
      },
      "quantify": "QuantifyCategory"
    }
  ],
  "batch_level": {
    "valid_ratio": 0.1,
    "batch_size": 32,
    "shuffle": true,
    "num_workers": 0
  },
  "x_models": {
    "image": {
      "model_name": "ImageConvEncoder",
      "src": "image",
      "kwargs": {
        "name": "resnet18"
      }
    }
  },
  "y_models": {
    "label": {
      "model_name": "CategoryTop",
      "src": "label",
      "kwar