# Data Loading/Fetching

## Installing dependencies

In [1]:
DEPENDENCIES = [
    'tf-slim==1.1.0',
    'numpy==1.21.6',
    'pandas==1.3.5',
    'seaborn',
    'torch==1.11.0+cpu',
    'torchvision==0.12.0+cpu',
    'matplotlib==3.5.3',
    'opencv-python==4.5.4.60',
    'sklearn==0.0.post1',
    'skorch==0.12.1',
    'tqdm',
    'requests',
    'plotly==5.11.0',
]

In [2]:
import subprocess
import typing as tp
import re

def install_dependencies(dependencies: tp.List[str], show_progress: bool = True) -> tp.Tuple[tp.List[str], tp.List[Exception]]:
    emit = print if show_progress else lambda x: None

    resolved_dependencies, errors = [], []
    for dependency in dependencies:
        emit(f'Installing "{dependency}"...')

        try:
            subprocess.run(["pip", "install", "--root-user-action=ignore", dependency], stdout=subprocess.DEVNULL)
            
            if '==' in dependency:
                dependency = re.search('(.+)==.+', dependency).group(1)

            if '@' in dependency:
                dependency = re.search('(.+) @ .+', dependency).group(1)
            
            pip_freeze = subprocess.Popen(("pip", "freeze"), stdout=subprocess.PIPE)
            output = subprocess.check_output(("grep", "-E", f"^({dependency}==)|({dependency} @).+$"), stdin=pip_freeze.stdout)
            resolved_dependencies.append(output.decode().strip())
        except subprocess.CalledProcessError as e:
            errors.append(e)
    
    return resolved_dependencies, errors

In [3]:
install_dependencies(DEPENDENCIES)

Installing "tf-slim==1.1.0"...
Installing "numpy==1.21.6"...
Installing "pandas==1.3.5"...
Installing "seaborn"...
Installing "torch==1.11.0+cpu"...
Installing "torchvision==0.12.0+cpu"...
Installing "matplotlib==3.5.3"...
Installing "opencv-python==4.5.4.60"...
Installing "sklearn==0.0.post1"...
Installing "skorch==0.12.1"...
Installing "tqdm"...
Installing "requests"...
Installing "plotly==5.11.0"...


(['tf-slim==1.1.0',
  'numpy==1.21.6',
  'pandas==1.3.5',
  'seaborn @ file:///home/conda/feedstock_root/build_artifacts/seaborn-split_1629095986539/work',
  'torch==1.11.0+cpu',
  'torchvision==0.12.0+cpu',
  'matplotlib==3.5.3',
  'opencv-python==4.5.4.60',
  'sklearn==0.0.post1',
  'skorch==0.12.1',
  'tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1649051611147/work',
  'requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1656534056640/work',
  'plotly==5.11.0'],
 [])

In [4]:
from pathlib import Path

BASE_DIR = Path.cwd()
INPUT_DIR = Path("/") / "kaggle" / "input"
DATA_DIR = INPUT_DIR / "planets-dataset" / "planet" / "planet" # https://www.kaggle.com/datasets/nikitarom/planets-dataset

TRAIN_SAMPLES_DIR = DATA_DIR / 'train-jpg'
TRAIN_LABELS_FILE = DATA_DIR / 'train_classes.csv' 

In [5]:
import matplotlib.image
import numpy as np
import numpy.typing as ntp
import cv2
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer 

from tqdm.notebook import tqdm

def load_image(filepath: Path) -> ntp.NDArray[np.int_]:
    return cv2.imread(str(filepath))

def load_data(
    dataset_dir: Path,
    classes_filepath: Path,
    limit: tp.Optional[int] = None,
) -> tp.Tuple[ntp.NDArray[np.float_], ntp.NDArray[np.int_]]:
    df = pd.read_csv(classes_filepath)

    df.tags = np.char.split(df.tags.values.astype(str))

    encoder = MultiLabelBinarizer().fit(df.tags)

    rows = list(df.iterrows())
    if limit is not None:
        rows = rows[:limit]

    progress_bar = tqdm(rows)
    
    X, y = [], []
    for _, row in progress_bar:
        filename = f'{row["image_name"]}.jpg'

        progress_bar.set_description(f'Loading "{filename}"...')

        X.append(load_image(dataset_dir / filename).flatten())
        y.append(encoder.transform([row['tags']]))
        
    return np.vstack(X), np.vstack(y), len(encoder.classes_)

In [None]:
X, y, n_classes = load_data(TRAIN_SAMPLES_DIR, TRAIN_LABELS_FILE, limit=10000)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=128).fit(X_train)

In [None]:
import plotly.graph_objects as go

go.Figure(data=go.Scatter(
    x=np.arange(1, len(pca.explained_variance_ratio_)),
    y=np.cumsum(pca.explained_variance_ratio_),
    stackgroup='one',
    fill='tonexty',
)).update_layout(
    barmode='stack',
    title="Principal Component Analysis",
    xaxis_title="Number of Components",
    yaxis_title="Explained Variance"
).show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler

def create_pipeline(clf: tp.Callable[..., BaseEstimator], n_components: int = 128, **kwargs) -> Pipeline:
    return Pipeline([
        ('pca', PCA(n_components=n_components)),
        ('scaler', StandardScaler()),
        ('model', clf(**kwargs))
    ])

In [None]:
from sklearn.multioutput import MultiOutputClassifier

clf = create_pipeline(MultiOutputClassifier, estimator=HistGradientBoostingClassifier())

In [None]:
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score

f1_score(y_val, y_pred, average='micro')