In [3]:
# lab_black: code-formatting extension for notebook cells
%load_ext lab_black
# autoreload in mode 2: re-import modules automatically before executing code,
# so edits to imported .py files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:

import cv2
from datetime import timedelta
import matplotlib.pyplot as plt
import numpy as np
import odc.stac
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

%matplotlib inline

In [5]:
# Data folders are resolved relative to the parent of the current working
# directory: <parent>/data and <parent>/data/benchmark.
DATA_DIR = Path.cwd().parent.resolve().joinpath("data")
BENCHMARK_DATA_DIR = DATA_DIR.joinpath("benchmark")

# Image arrays are saved here in case we want to generate more features later.
# Creating the directory (with parents) is a no-op when it already exists.
IMAGE_ARRAY_DIR = BENCHMARK_DATA_DIR.joinpath("image_arrays")
IMAGE_ARRAY_DIR.mkdir(parents=True, exist_ok=True)
BENCHMARK_DATA_DIR

PosixPath('/home/alenaastrakhantseva/PycharmProjects/tick_tick_bloom/data/benchmark')

In [6]:
# Read the training labels table from the data folder and preview the first rows.
train_labels_path = DATA_DIR / "train_labels.csv"
train_labels = pd.read_csv(train_labels_path)
train_labels.head()

Unnamed: 0,uid,region,severity,density
0,aabm,midwest,1,585.0
1,aacd,south,1,290.0
2,aaee,south,1,1614.0
3,aaff,midwest,3,111825.0
4,aafl,midwest,4,2017313.0


In [13]:
# Read the saved image features; the first CSV column is used as the index.
image_features_path = BENCHMARK_DATA_DIR / "image_features.csv"
image_features = pd.read_csv(image_features_path, index_col=0)
image_features.head()

Unnamed: 0,red_average,green_average,blue_average,red_median,green_median,blue_median
umac,26.305195,44.17316,28.357143,25.0,34.0,27.0
egox,0.0,0.0,0.0,0.0,0.0,0.0
havx,0.0,0.0,0.0,0.0,0.0,0.0
laoq,0.0,0.0,0.0,0.0,0.0,0.0
ttsk,24.071429,41.266234,21.489177,23.0,40.0,21.0


## Split the data

In [17]:
# Join labels with image features so that every row carries both; keeping them
# in a single dataframe guarantees the feature and label arrays built from it
# share the same row order. The join matches the labels' "uid" column against
# the feature index, keeps only uids present on both sides, and validates that
# the match is one-to-one.
train_data = pd.merge(
    train_labels,
    image_features,
    how="inner",
    left_on="uid",
    right_index=True,
    validate="1:1",
)

# Randomly assign each row to train or validation (seeded for reproducibility)
split_names = ["train", "validation"]
split_probabilities = [0.67, 0.33]
rng = np.random.RandomState(30)
train_data["split"] = rng.choice(
    split_names, size=len(train_data), replace=True, p=split_probabilities
)

train_data.head()

Unnamed: 0,uid,region,severity,density,red_average,green_average,blue_average,red_median,green_median,blue_median,split
2818,egox,south,2,29046.0,0.0,0.0,0.0,0.0,0.0,0.0,train
3558,fknr,west,1,173.790293,255.0,255.0,255.0,255.0,255.0,255.0,train
4610,havx,south,1,94.0,0.0,0.0,0.0,0.0,0.0,0.0,train
5931,jbjj,south,1,3870.0,255.0,255.0,255.0,255.0,255.0,255.0,train
7309,laoq,south,1,2179.0,0.0,0.0,0.0,0.0,0.0,0.0,validation


In [18]:
# Columns fed to the model, and the column the model has to predict
feature_cols = [
    "red_average",
    "green_average",
    "blue_average",
    "red_median",
    "green_median",
    "blue_median",
]
target_col = "severity"

# Partition the rows by their split assignment first, then pick columns
val_set_mask = train_data["split"] == "validation"
train_rows = train_data[~val_set_mask]
val_rows = train_data[val_set_mask]

# features as 2-d arrays
X_train = train_rows[feature_cols].values
X_val = val_rows[feature_cols].values

# labels as flat 1-d arrays
y_train = train_rows[target_col].values.flatten()
y_val = val_rows[target_col].values.flatten()

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((8, 6), (2, 6), (8,), (2,))

In [19]:
# Show one feature row and the first (up to ten) training labels
first_feature_row = X_train[0]
first_labels = y_train[:10]
print("X_train[0]:", first_feature_row)
print("y_train[:10]:", first_labels)

X_train[0]: [0. 0. 0. 0. 0. 0.]
y_train[:10]: [2 1 1 1 4 1 4 1]


## Build LightGBM Model

In [20]:
# Persist the training features and labels as .npy files in the benchmark
# folder so the training step can load them from disk.
x_train_pth = BENCHMARK_DATA_DIR / "x_train.npy"
y_train_pth = BENCHMARK_DATA_DIR / "y_train.npy"
x_train_pth.parent.mkdir(parents=True, exist_ok=True)

# np.save accepts a path directly and handles opening/closing the file
np.save(x_train_pth, X_train)
np.save(y_train_pth, y_train)

In [24]:
import lightgbm as lgb

import joblib
import numpy as np
from pathlib import Path
from loguru import logger
import typer

# Train a LightGBM model based on training features in features_path and
# training labels in labels_path. Save out the trained model to model_save_path.
#
# The paths are built from BENCHMARK_DATA_DIR (set in the configuration cell)
# rather than by reassigning DATA_DIR: rebinding DATA_DIR to the benchmark
# folder here would silently break a re-run of earlier cells that expect it to
# point at the top-level data folder (e.g. the cell reading train_labels.csv).
features_path = BENCHMARK_DATA_DIR / "x_train.npy"
labels_path = BENCHMARK_DATA_DIR / "y_train.npy"
# NOTE: the model is written with joblib.dump, so despite the .txt suffix this
# file is a serialized Python object and must be read back with joblib.load.
model_save_path = BENCHMARK_DATA_DIR / "lgb_classifier.txt"

# load saved features and labels
with open(features_path, "rb") as f:
    X_train = np.load(f)
with open(labels_path, "rb") as f:
    y_train = np.load(f)

logger.info(f"Loaded training features of shape {X_train.shape} from {features_path}")
logger.info(f"Loading training labels of shape {y_train.shape} from {labels_path}")

# instantiate tree model (fixed random_state for reproducible fits)
model = lgb.LGBMClassifier(random_state=10)

# fit model
logger.info("Fitting LGBM model")
model.fit(X_train, y_train)
print(model)

# save out the fitted model
joblib.dump(model, str(model_save_path))
logger.success(f"Model weights saved to {model_save_path}")


2022-12-28 21:48:18.537 | INFO     | __main__:<module>:28 - Loaded training features of shape (8, 6) from /home/alenaastrakhantseva/PycharmProjects/tick_tick_bloom/data/benchmark/x_train.npy
2022-12-28 21:48:18.537 | INFO     | __main__:<module>:29 - Loading training labels of shape (8,) from /home/alenaastrakhantseva/PycharmProjects/tick_tick_bloom/data/benchmark/y_train.npy
2022-12-28 21:48:18.538 | INFO     | __main__:<module>:35 - Fitting LGBM model
2022-12-28 21:48:18.562 | SUCCESS  | __main__:<module>:41 - Model weights saved to /home/alenaastrakhantseva/PycharmProjects/tick_tick_bloom/data/benchmark/lgb_classifier.txt


LGBMClassifier(random_state=10)


## Validation

In [25]:
# Persist the validation features and labels as .npy files in the benchmark
# folder so the prediction step can load them from disk.
x_val_pth = BENCHMARK_DATA_DIR / "x_val.npy"
y_val_pth = BENCHMARK_DATA_DIR / "y_val.npy"
x_val_pth.parent.mkdir(parents=True, exist_ok=True)

# np.save accepts a path directly and handles opening/closing the file
np.save(x_val_pth, X_val)
np.save(y_val_pth, y_val)

In [26]:
import lightgbm as lgb

import joblib
from loguru import logger
import numpy as np
from pathlib import Path
import typer

# Generate predictions with a LightGBM model using weights saved at
# model_weights_path and features saved at features_path. Save out predictions
# to preds_save_path.
#
# The paths are built from BENCHMARK_DATA_DIR (set in the configuration cell)
# rather than by reassigning DATA_DIR: rebinding DATA_DIR to the benchmark
# folder here would silently break a re-run of earlier cells that expect it to
# point at the top-level data folder (e.g. the cell reading train_labels.csv).
model_weights_path = BENCHMARK_DATA_DIR / "lgb_classifier.txt"
features_path = BENCHMARK_DATA_DIR / "x_val.npy"
preds_save_path = BENCHMARK_DATA_DIR / "val_preds.npy"

# load the fitted model (a joblib-serialized object written by the training cell)
lgb_model = joblib.load(model_weights_path)
logger.info(f"Loaded model {lgb_model} from {model_weights_path}")

# load the features
with open(features_path, "rb") as f:
    X_val = np.load(f)
logger.info(f"Loaded features of shape {X_val.shape} from {features_path}")

# generate predictions
preds = lgb_model.predict(X_val)

# save out predictions
with open(preds_save_path, "wb") as f:
    np.save(f, preds)
logger.success(f"Predictions saved to {preds_save_path}")

2022-12-28 21:50:17.207 | INFO     | __main__:<module>:22 - Loaded model LGBMClassifier(random_state=10) from /home/alenaastrakhantseva/PycharmProjects/tick_tick_bloom/data/benchmark/lgb_classifier.txt
2022-12-28 21:50:17.208 | INFO     | __main__:<module>:27 - Loaded features of shape (2, 6) from /home/alenaastrakhantseva/PycharmProjects/tick_tick_bloom/data/benchmark/x_val.npy
2022-12-28 21:50:17.220 | SUCCESS  | __main__:<module>:35 - Predictions saved to /home/alenaastrakhantseva/PycharmProjects/tick_tick_bloom/data/benchmark/val_preds.npy


In [27]:
# Read the saved validation predictions back from the benchmark folder
preds_pth = BENCHMARK_DATA_DIR / "val_preds.npy"
val_preds = np.load(preds_pth)

In [28]:
# peek at the first (up to ten) validation predictions
val_preds[:10]

array([1, 1])

In [29]:
# how often each predicted value occurs, ordered by the predicted value
pd.Series(val_preds).value_counts().sort_index()

1    2
dtype: int64

In [30]:
# Take an independent copy of the validation rows (identifier, region and true
# severity only) and attach the model's predictions next to them.
is_validation_row = train_data["split"] == "validation"
val_set = train_data.loc[is_validation_row, ["uid", "region", "severity"]].copy()
val_set["pred"] = val_preds

val_set.head()

Unnamed: 0,uid,region,severity,pred
7309,laoq,south,1,1
13003,ttsk,south,1,1


In [31]:
# Score the validation predictions: RMSE between true and predicted severity is
# computed per region, and the final score is the unweighted mean of the
# regional RMSEs.
#
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument is deprecated in newer scikit-learn releases, whereas
# the square root of the MSE yields the same value on every version.
region_scores = []
for region in val_set.region.unique():
    sub = val_set[val_set.region == region]
    region_rmse = np.sqrt(mean_squared_error(sub.severity, sub.pred))
    print(f"RMSE for {region} (n={len(sub)}): {round(region_rmse, 4)}")
    region_scores.append(region_rmse)

overall_rmse = np.mean(region_scores)
print(f"Final score: {overall_rmse}")

RMSE for south (n=2): 0.0
Final score: 0.0


In [32]:
# what's our RMSE across all validation data points?
# Computed as sqrt(MSE): the `squared=False` argument of mean_squared_error is
# deprecated in newer scikit-learn releases, and sqrt(MSE) is the same value.
np.sqrt(mean_squared_error(y_val, val_preds))

0.0

In [33]:
# Compare how often each severity level appears among the predictions with how
# often it appears among the actual validation labels.
val_results = pd.DataFrame({"pred": val_preds, "actual": y_val})

predicted_counts = val_results["pred"].value_counts().sort_index().rename("predicted")
actual_counts = val_results["actual"].value_counts().sort_index().rename("actual")

# side-by-side table indexed by severity level
pd.concat([predicted_counts, actual_counts], axis=1).rename_axis(
    "severity_level_count"
)

Unnamed: 0_level_0,predicted,actual
severity_level_count,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,2
