In [1]:
from pprint import pprint
import sys

import modal
import pandas as pd

# Explore Data

## THE FLAT DATA CHALLENGE

- 100,000 records, with 80 data columns: 60 numeric, 20 categorical

In [2]:
# Load the flat-challenge training set from the local datasets folder.
df = pd.read_csv("00_original_datasets/flat-training.csv")

In [3]:
# Preview the first rows of the flat training data.
df.head()

Unnamed: 0,dog,cat,rabbit,deer,panda,koala,otter,hedgehog,squirrel,dolphin,...,blanket,button,whistle,marble,wagon,storybook,candle,clover,bubble,cookie
0,10,A5DB,,4.46,-2,T2,B9DE,51.8,0,1,...,A8,2,2,0,-0.76,-54,B2,0.38,0,C13
1,10,A5DB,,4.42,0,T3,027A,72.2,1,1,...,A7,16,0,0,-0.76,-48,B0,0.16,0,C15
2,43,027A,8.0,3.11,-1,T0,B9DE,44.2,1,1,...,A4,-25,10,0,-0.88,-39,B0,0.13,0,C1
3,28,63D1,,3.37,-1,T1,027A,41.0,0,1,...,A3,21,9,0,-0.85,-58,B1,0.29,1,C5
4,82,C09E,,3.07,-6,T0,B9DE,46.4,1,1,...,A7,-4,13,0,-0.85,70,B2,0.75,1,C13


# Dataset Analysis Summary

## Column Distribution Patterns

| Column | Type | Unique Values | Missing (%) | Top Value | Top Frequency (%) | Notes |
|--------|------|---------------|-------------|-----------|-------------------|-------|
| dog | Numeric | 81 | 0% | 28 | 9.77% | Skewed distribution |
| cat | Categorical | 29 | 0% | A5DB | 6.63% | Hex-like codes |
| rabbit | Numeric | 100 | 75.98% | 1.0 | 0.45% | **High missing data** |
| deer | Numeric | 300 | 0% | 1.37 | 1.10% | Decimal values |
| panda | Numeric | 7 | 0% | 0 | 34.14% | Range: -6 to 0 |
| koala | Categorical | 4 | 0% | T0 | 25.13% | Even distribution T0-T3 |
| otter | Categorical | 3 | 0% | 027A | 50.22% | Hex codes |
| hedgehog | Numeric | 729 | 0% | 0.0 | 0.99% | **High cardinality** |
| squirrel | Binary | 2 | 0% | 1 | 54.12% | 0/1 values |
| dolphin | Binary | 2 | 0% | 1 | 93.26% | Highly skewed binary |
| penguin | Binary | 2 | 0% | 20 | 89.10% | 10/20 values |
| turtle | Numeric | 65 | 0% | 0.0 | 78.83% | Zero-inflated |
| elephant | Numeric | 29 | 0% | 6 | 6.54% | Range: 1-29 |
| giraffe | Binary | 2 | 0% | y | 98.85% | y/n values |
| lamb | Binary | 2 | 0% | 0 | 58.67% | 0/5 values |
| goat | Categorical | 2 | 0% | 027A | 60.90% | Hex codes |
| cow | Numeric | 83 | 49.99% | 0.43 | 0.67% | **High missing data** |
| horse | Numeric | 28 | 0% | 10 | 6.70% | Range: 1-28 |
| donkey | Categorical | 9 | 0% | X2 | 12.50% | X0-X8 codes |
| pony | Mixed | 2 | 0% | M | 87.94% | 5/M values |
| llama | Categorical | 2 | 0% | B9DE | 51.44% | Hex codes |
| mouse | Numeric | 216 | 0% | 20 | 1.03% | Wide range |
| hamster | Categorical | 12 | 0% | D6 | 8.55% | D0-D11 codes |
| guinea | Numeric | 316 | 0% | 27 | 0.59% | **High cardinality** |
| duck | Categorical | 14 | 0% | 5A6C | 13.09% | Hex codes |
| chicken | Numeric | 85 | 0% | 3.5 | 1.66% | Decimal values |
| sparrow | Binary | 2 | 0% | 0 | 90.00% | 0/120 values |
| parrot | Numeric | 152 | 0% | 4 | 2.02% | Range: 0-151 |
| finch | Binary | 2 | 0% | 0 | 97.71% | Highly skewed binary |
| canary | Numeric | 110 | 0% | 0 | 88.25% | Zero-inflated |
| bee | Numeric | 15 | 0% | 11.5 | 15.46% | Normal-like distribution |
| butterfly | Categorical | 56 | 0% | 5EB2 | 3.27% | Hex codes |
| ladybug | Numeric | 71 | 0% | 5 | 5.26% | Range: 0-70 |
| snail | Numeric | 716 | 0% | 0.602 | 1.06% | **High cardinality** |
| frog | Numeric | 384 | 0% | 16 | 0.46% | **High cardinality** |
| cricket | Numeric | 6 | 0% | 0 | 36.72% | Range: 0-5 |
| tamarin | Categorical | 7 | 0% | Y1 | 15.41% | Y0-Y6 codes |
| wallaby | Categorical | 6 | 0% | 0D45 | 50.53% | Hex codes |
| wombat | Numeric | 258 | 0% | 3 | 1.44% | Range: 3-260 |
| zebra | Binary | 2 | 0% | 0 | 62.39% | 0/1 values |
| flamingo | Numeric | 21 | 0% | 0.1 | 21.63% | Right-skewed |
| peacock | Binary | 2 | 0% | 0 | 92.82% | Highly skewed binary |
| bat | Binary | 2 | 0% | 0 | 94.64% | Highly skewed binary |
| fox | Numeric | 4 | 0% | 5 | 86.34% | Concentrated on 5 |
| beaver | Numeric | 4 | 0% | 0 | 34.32% | Range: -2 to 1 |
| monkey | Categorical | 4 | 0% | Z0 | 26.31% | Z0-Z3 codes |
| seal | Numeric | 66 | 0% | 5.2 | 14.82% | Decimal values |
| robin | Binary | 2 | 0% | 0 | 98.80% | Highly skewed binary |
| loon | Numeric | 288 | 49.88% | 0.23 | 0.99% | **High missing data** |
| swan | Numeric | 3 | 0% | 3 | 78.34% | Range: 2-4 |
| goldfish | Numeric | 208 | 0% | 25 | 1.05% | Range: 6-213 |
| minnow | Binary | 2 | 0% | 0 | 71.80% | 0/1 values |
| mole | Binary | 2 | 0% | 0 | 84.28% | 0/1 values |
| shrew | Numeric | 1162 | 0% | -10.0 | 75.64% | **Highly skewed, high cardinality** |
| puffin | Numeric | 109 | 0% | 0 | 90.27% | Zero-inflated |
| owl | Binary | 2 | 0% | 0 | 94.55% | Highly skewed binary |
| bunny | Numeric | 3 | 0% | 3 | 83.05% | Range: 2-4 |
| bear | Numeric | 11 | 0% | 2.2 | 18.38% | Range: 1.4-2.4 |
| chipmunk | Categorical | 4 | 0% | 027A | 33.96% | Hex codes |
| cub | Numeric | 651 | 0% | 3.0 | 1.31% | **High cardinality** |
| acorn | Numeric | 163 | 0% | 6 | 1.79% | Range: 2-164 |
| leaf | Categorical | 3 | 0% | C09E | 65.07% | Hex codes |
| cloud | Numeric | 21 | 79.40% | -10.0 | 1.18% | **High missing data** |
| rainbow | Numeric | 46 | 0% | 2.4 | 12.01% | Range: 0.6-4.5 |
| puddle | Numeric | 3 | 0% | 2 | 96.21% | Range: 1-3 |
| berry | Numeric | 363 | 0% | 13 | 0.38% | **High cardinality** |
| apple | Numeric | 95 | 0% | -51 | 1.00% | Negative values |
| honey | Numeric | 4 | 0% | 5 | 87.37% | Range: 3-6 |
| pumpkin | Numeric | 196 | 0% | -97 | 1.34% | Negative values |
| teddy | Categorical | 90 | 0% | 248B | 2.94% | Hex codes |
| blanket | Categorical | 10 | 0% | A0 | 11.03% | A0-A9 codes |
| button | Numeric | 75 | 0% | -37 | 1.33% | Negative values |
| whistle | Numeric | 17 | 0% | 13 | 6.67% | Range: -2 to 14 |
| marble | Binary | 2 | 0% | 0 | 90.28% | Highly skewed binary |
| wagon | Numeric | 101 | 0% | -1.13 | 1.05% | Negative decimals |
| storybook | Numeric | 174 | 0% | -86 | 1.40% | Negative values |
| candle | Categorical | 5 | 0% | B3 | 20.32% | B0-B4 codes |
| clover | Numeric | 547 | 0% | 0.04 | 1.32% | **High cardinality** |
| bubble | Binary | 2 | 0% | 0 | 67.71% | 0/1 values |
| cookie | Categorical | 20 | 0% | C12 | 5.96% | C0-C19 codes |

## Summary Statistics
- **Total Columns**: 80
- **Total Rows**: 100,000
- **Binary Columns**: 16
- **High Cardinality (>500 unique)**: 5 columns
- **High Missing Data (>40%)**: 4 columns
- **Categorical with Codes**: 15+ columns

In [6]:
# Container image for the flat-data challenge: Debian slim with Python 3.12.10,
# scipy and the MOSTLY AI SDK (local GPU extras) installed through uv, and the
# local dataset folder exposed inside the container under /root/data.
flat_data_challenge = (
    modal.Image.debian_slim(python_version="3.12.10")
    # Installing the uv package manager does not need a GPU, so none is
    # attached to this build step.
    .pip_install("uv")
    .run_commands("uv pip install --system --compile-bytecode scipy==1.13.1 'mostlyai[local-gpu]'")
    # Remote functions read the training CSVs from /root/data.
    .add_local_dir("./00_original_datasets", remote_path="/root/data")
)

In [3]:
# Modal app that the remote functions in this notebook are registered on.
app = modal.App("flat-data-challenge")
# Named Modal volume; created on first use if it does not exist yet.
# NOTE(review): not attached to any function in the cells shown here — confirm it is used later.
volume = modal.Volume.from_name("mostlyai-challenge-volume", create_if_missing=True)

In [4]:
@app.function(image=flat_data_challenge)
def hello_mostlyai():
    """Smoke-test the MOSTLY AI SDK in local mode.

    Returns:
        A 4-tuple of strings: the ``str()`` of the client's ``me()``,
        ``about()``, ``models()`` and ``computes()`` results, in that order.
    """
    # Imported inside the function so it resolves in the container image,
    # where the SDK is installed.
    from mostlyai.sdk import MostlyAI

    client = MostlyAI(local=True)
    probes = (client.me, client.about, client.models, client.computes)
    return tuple(str(probe()) for probe in probes)

In [5]:
# Run the smoke test remotely and print the four reports, each separated by a
# 60-dash rule.
with app.run():
    user_info, platform_info, available_models, available_computers = hello_mostlyai.remote()
    print(
        user_info,
        platform_info,
        available_models,
        available_computers,
        sep="\n" + "---" * 20 + "\n",
    )

id=None name=None first_name=None last_name=None email=None avatar=None settings=None usage=None unread_notifications=None organizations=None
------------------------------------------------------------
version='4.7.9' assistant=False
------------------------------------------------------------
{'TABULAR': ['MOSTLY_AI/Small', 'MOSTLY_AI/Medium', 'MOSTLY_AI/Large'], 'LANGUAGE': ['MOSTLY_AI/LSTMFromScratch-3m', 'microsoft/phi-1_5', '(HuggingFace-hosted models)']}
------------------------------------------------------------
[]


In [6]:
@app.function(image=flat_data_challenge)
def explore_data():
    """Return the ``(rows, columns)`` shape of the flat training CSV read from /root/data."""
    import pandas as pd

    return pd.read_csv("/root/data/flat-training.csv").shape

In [8]:
# Fetch the dataset shape from the remote function, then report it.
with app.run():
    flat_shape = explore_data.remote()
    print(f"Total number of rows and columns -> {flat_shape}")

Total number of rows and columns -> (100000, 80)


## THE SEQUENTIAL DATA CHALLENGE

- 20,000 groups, with 5-10 records each, with 10 data columns: 7 numeric, 3 categorical

In [8]:
# Load the sequential-challenge training set and preview its first rows.
seq_df = pd.read_csv("./00_original_datasets/sequential-training.csv")
seq_df.head()

Unnamed: 0,group_id,alice,david,emily,jacob,james,john,mike,lucas,mary,sarah
0,e5f463e4,V0,4.1,A0,,20.0,X6,-1,1.9,2.7,0.0
1,e5f463e4,V0,4.3,A0,,18.0,X5,-1,1.7,1.5,0.0
2,e5f463e4,V0,4.4,A0,,20.0,X6,-1,1.7,1.2,0.0
3,e5f463e4,V0,4.6,A0,,17.0,X4,-1,1.3,0.8,0.0
4,e5f463e4,V0,4.8,A0,,17.0,X4,-1,1.2,0.4,0.0


In [9]:
# List the column labels of the sequential training data.
seq_df.columns

Index(['group_id', 'alice', 'david', 'emily', 'jacob', 'james', 'john', 'mike',
       'lucas', 'mary', 'sarah'],
      dtype='object')

In [11]:
# Total number of rows in the sequential training data.
len(seq_df)

154456