# LF AI & Data Foundation - Tool Exploratory Data Analysis

Exploring tools which are part of or associated with the LF AI & Data Foundation.

In [3]:
from datetime import datetime

import pandas as pd
import pytz
import requests
import yaml

# shared UTC tzinfo used when building timezone-aware "now" timestamps
utc = pytz.utc

In [4]:
# gather data: download the processed landscape YAML from GitHub
link = (
    "https://raw.githubusercontent.com/lfai/lfai-landscape/main/processed_landscape.yml"
)
# a timeout keeps the cell from hanging indefinitely if the host stops responding
raw_content = requests.get(link, timeout=30)
# fail right here on an HTTP error instead of feeding an error page to the YAML parser
raw_content.raise_for_status()
# size of the downloaded document, in characters
len(raw_content.text)

2472571

In [5]:
# parse the downloaded YAML text into a Python dict
yaml_text = raw_content.text
dict_content = yaml.safe_load(yaml_text)
# peek at the top-level sections of the document
dict_content.keys()

dict_keys(['landscape', 'twitter_options', 'updated_at'])

In [117]:
# flatten into a pandas df: one row per record found under
# landscape -> subcategories -> items, with the parent category name
# carried along as the "category.name" column
items_flat = pd.json_normalize(
    dict_content["landscape"],
    record_path=["subcategories", "items"],
    meta=[["category", "name"]],
)
# the "item" column is not needed for the analysis
df = items_flat.drop(columns="item")
df.head()

Unnamed: 0,name,homepage_url,repo_url,logo,crunchbase,repos,crunchbase_data.name,crunchbase_data.description,crunchbase_data.num_employees_min,crunchbase_data.num_employees_max,...,crunchbase_data.kind,yahoo_finance_data.market_cap,yahoo_finance_data.effective_ticker,twitter_data.latest_tweet_date,project,description,stock_ticker,url_for_bestpractices,organization.name,category.name
0,Accord.NET,http://accord-framework.net/,https://github.com/accord-net/framework,accord-net.svg,https://www.crunchbase.com/organization/accord...,[{'url': 'https://github.com/accord-net/framew...,Accord.NET Framework,Machine Learning Framework,,,...,,,,NaT,,,,,,Framework
1,LightGBM,https://github.com/Microsoft/LightGBM,https://github.com/Microsoft/LightGBM,lightgbm.svg,https://www.crunchbase.com/organization/microsoft,[{'url': 'https://github.com/Microsoft/LightGB...,Microsoft,Microsoft is a software corporation that devel...,10001.0,1000000.0,...,market_cap,2015598000000.0,MSFT,NaT,,,,,,Framework
2,Mahout,https://mahout.apache.org/,https://github.com/apache/mahout,mahout.svg,https://www.crunchbase.com/organization/apache,"[{'url': 'https://github.com/apache/mahout', '...",The Apache Software Foundation,Apache Software Foundation provides organizati...,1.0,10.0,...,,,,2020-10-16 16:57:06+00:00,,,,,,Framework
3,ML.net,https://dotnet.microsoft.com/en-us/apps/machin...,https://github.com/dotnet/machinelearning,ml-net.svg,https://www.crunchbase.com/organization/microsoft,[{'url': 'https://github.com/dotnet/machinelea...,Microsoft,Microsoft is a software corporation that devel...,10001.0,1000000.0,...,market_cap,2015598000000.0,MSFT,NaT,,,,,,Framework
4,Ray,https://bair.berkeley.edu/blog/2018/01/09/ray/,https://github.com/ray-project/ray,ray.svg,https://www.crunchbase.com/organization/univer...,"[{'url': 'https://github.com/ray-project/ray',...",University of California Berkeley,"University of California, Berkeley is a public...",10001.0,1000000.0,...,funding,,,2022-05-10 23:49:42+00:00,,,,,,Framework


In [128]:
# gather days since initial commit
# utc=True guarantees a timezone-aware column, so it can always be subtracted
# from the timezone-aware "now" below
df["github_start_commit_data.start_date"] = pd.to_datetime(
    df["github_start_commit_data.start_date"], utc=True
)
# .dt.days yields the whole number of elapsed days; rows with no start date (NaT)
# come out as NaN. The previous astype("timedelta64[D]") is rejected by
# pandas >= 2.0, which only supports s/ms/us/ns timedelta resolutions.
df["days_since_first_commit"] = (
    (df["github_start_commit_data.start_date"] - pd.Timestamp(datetime.now(tz=utc)))
    .abs()
    .dt.days
)
df["days_since_first_commit"].head()

0    3686.0
1    2106.0
2    5231.0
3    1469.0
4    2285.0
Name: days_since_first_commit, dtype: float64

In [122]:
# inspect the first-commit timestamp column
df.loc[:, "github_start_commit_data.start_date"]

0     2012-04-08 14:05:58+00:00
1     2016-08-05 06:06:01+00:00
2     2008-01-15 03:13:56+00:00
3     2018-05-04 00:22:00+00:00
4     2016-02-07 22:18:40+00:00
                 ...           
394                         NaT
395                         NaT
396                         NaT
397                         NaT
398                         NaT
Name: github_start_commit_data.start_date, Length: 399, dtype: datetime64[ns, UTC]

In [66]:
# number of landscape entries per category, most common first
(
    df["category.name"]
    .value_counts()
)

Hosting Companies              27
Library                        25
Natural Language Processing    25
Associate                      22
Tool                           21
Platform                       21
Store & Format                 19
Framework                      18
Workflow                       17
Programming                    16
General                        16
Computing & Management         15
Visualization                  15
Stream Processing              14
Notebook Environment           11
Premier                         9
Explainability                  9
Reinforcement Learning          9
Operations                      9
Inference                       7
Labeling & Annotation           7
Pipeline Management             7
Federated Learning              5
Training                        5
Security & Privacy              5
Bias & Fairness                 5
SQL Engine                      5
Adversarial                     5
Relational DB                   4
Feature Engine

In [130]:
def show_category_tools(tools, category):
    """Return the tools of one landscape category, ranked for comparison.

    Parameters
    ----------
    tools : pandas.DataFrame
        Flattened landscape table. Must contain "category.name", "name",
        "homepage_url", "crunchbase_data.name" and the three ranking columns
        listed in ``sort_keys`` below.
    category : str
        Value of "category.name" to keep (exact match).

    Returns
    -------
    pandas.DataFrame
        Rows of ``category`` sorted in descending order by GitHub stars, then
        contributor count, then days since first commit, restricted to the
        identifying columns followed by the ranking columns.
    """
    sort_keys = [
        "github_data.stars",
        "github_data.contributors_count",
        "days_since_first_commit",
    ]
    display_columns = ["name", "homepage_url", "crunchbase_data.name", *sort_keys]
    in_category = tools["category.name"] == category
    return tools[in_category].sort_values(sort_keys, ascending=False)[display_columns]


# show sorted workflow tools
# (the helper works for any category, e.g. show_category_tools(df, "Visualization"))
show_category_tools(df, "Workflow")

Unnamed: 0,name,homepage_url,crunchbase_data.name,github_data.stars,github_data.contributors_count,days_since_first_commit
222,Apache Airflow,https://airflow.apache.org/,The Apache Software Foundation,25845.0,2391.0,2774.0
234,Luigi,https://github.com/spotify/luigi,Spotify,15660.0,587.0,3828.0
224,Argo,https://argoproj.github.io/,Intuit,11009.0,581.0,1667.0
236,Prefect,https://docs.prefect.io/,Prefect,8831.0,323.0,1875.0
232,Kedro,https://kedro.readthedocs.io/en/stable,LF AI & Data Foundation,7189.0,150.0,1092.0
227,Cadence,https://cadenceworkflow.io/,Uber,5977.0,104.0,2025.0
225,Azkaban,https://azkaban.github.io,LinkedIn,4034.0,144.0,3661.0
226,BentoML,https://www.bentoml.com,BentoML.ai,3474.0,105.0,1136.0
237,TRAINS,https://github.com/allegroai/clearml,Allegro.AI,3166.0,49.0,1067.0
223,Apache Nifi,https://nifi.apache.org,The Apache Software Foundation,3126.0,522.0,2711.0


In [None]:
# show sorted visualization tools
# rank by stars, then contributors, then project age (all descending)
ranking_columns = [
    "github_data.stars",
    "github_data.contributors_count",
    "days_since_first_commit",
]
shown_columns = ["name", "homepage_url", "crunchbase_data.name", *ranking_columns]
is_visualization = df["category.name"] == "Visualization"
df[is_visualization].sort_values(ranking_columns, ascending=False)[shown_columns]