In [113]:
import os.path

import datasets
import huggingface_hub
import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

class WindSpeedDataset:
    """Column names and threshold used to turn the dataset into (X, y) arrays.

    NOTE(review): despite the class name, LABEL_COL is "RGRND(W/m2)" rather
    than a wind-speed column - confirm this is the intended target.
    """

    # Column that is thresholded into the boolean label.
    LABEL_COL = "RGRND(W/m2)"

    # Rows whose LABEL_COL value is strictly greater than this are labelled True.
    # NOTE(review): the scale/units of the stored values are not visible here - confirm.
    STRONG_RAD = 0.4

    # Feature columns, in the order they appear in the feature matrix.
    INPUT_COL = [
        "AirNOW_O3",
        "CMAQ12KM_O3(ppb)",
        "CMAQ12KM_NO2(ppb)",
        "CMAQ12KM_CO(ppm)",
        "PBL(m)",
    ]

    # Number of feature columns, derived from INPUT_COL so the two cannot drift apart.
    FEATURES_NUM = len(INPUT_COL)


def convert_to_numpy(dataset):
    """Turn one dataset split into a feature matrix and a boolean label vector.

    Args:
        dataset: Object exposing ``to_pandas()`` whose resulting frame contains
            the columns named in ``WindSpeedDataset.INPUT_COL`` and
            ``WindSpeedDataset.LABEL_COL``.

    Returns:
        A ``(X, y)`` tuple: ``X`` holds the input columns as a NumPy array and
        ``y`` is True wherever the label column is strictly greater than
        ``WindSpeedDataset.STRONG_RAD``.
    """
    frame = dataset.to_pandas()
    features = frame[WindSpeedDataset.INPUT_COL].to_numpy()
    # Binarise the label column against the threshold.
    above_threshold = frame[WindSpeedDataset.LABEL_COL] > WindSpeedDataset.STRONG_RAD
    return features, above_threshold.to_numpy()



In [1]:
# Authenticate this notebook session with the Hugging Face Hub.
# The imports cell must be executed first: the NameError recorded below comes from
# running this cell (execution count 1) before `huggingface_hub` was imported.
huggingface_hub.notebook_login()

NameError: name 'huggingface_hub' is not defined

In [114]:
# Load the dataset; the recorded run resolved it from the local Hugging Face cache.
ds = datasets.load_dataset("ikkiren/bigdata_ds")

# Convert each split into a feature matrix and a boolean label vector.
# (The former `ds["train"] = ds["train"]` / `ds["test"] = ds["test"]` lines were
# self-assignments with no effect and have been dropped.)
X_train, y_train = convert_to_numpy(ds["train"])
X_test, y_test = convert_to_numpy(ds["test"])

Using the latest cached version of the dataset since ikkiren/bigdata_ds couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\artfe\.cache\huggingface\datasets\ikkiren___bigdata_ds\default\0.0.0\efe751a400eccc2c3fd63e53a4eb4e306a04ec23 (last modified on Fri Apr 25 16:51:59 2025).


In [115]:
# Show the DatasetDict summary (splits, feature names, row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['Latitude_x', 'Longitude_x', 'AirNOW_O3', 'CMAQ12KM_O3(ppb)', 'CMAQ12KM_NO2(ppb)', 'CMAQ12KM_CO(ppm)', 'CMAQ_OC(ug/m3)', 'PRSFC(Pa)', 'PBL(m)', 'TEMP2(K)', 'WDIR10(degree)', 'RGRND(W/m2)', 'CFRAC', 'month', 'day', 'hours', 'month_sine', 'day_sine', 'hours_sine', 'WSPD10(m/s)'],
        num_rows: 7429488
    })
    test: Dataset({
        features: ['Latitude_x', 'Longitude_x', 'AirNOW_O3', 'CMAQ12KM_O3(ppb)', 'CMAQ12KM_NO2(ppb)', 'CMAQ12KM_CO(ppm)', 'CMAQ_OC(ug/m3)', 'PRSFC(Pa)', 'PBL(m)', 'TEMP2(K)', 'WDIR10(degree)', 'RGRND(W/m2)', 'CFRAC', 'month', 'day', 'hours', 'month_sine', 'day_sine', 'hours_sine', 'WSPD10(m/s)'],
        num_rows: 1857372
    })
})

In [116]:
# Number of training labels that are False: total label count minus the non-zero (True) count.
# NOTE(review): `count` is not displayed or used by any cell visible in this notebook.
count = np.size(y_train) - np.count_nonzero(y_train)

In [106]:
# Candidate model: random forest classifier.
# Every model cell in this notebook rebinds the global `model`; whichever cell ran
# last is the one that gets evaluated and saved further down.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,  # fixed seed so repeated fits are reproducible
    n_jobs=-1
)

model.fit(X_train, y_train)


In [39]:
# Candidate model: RBF-kernel SVC.
# Original author's note (translated): this will finish running in about 10 years,
# i.e. the fit is impractically slow here; the recorded train split has 7,429,488 rows.
# Rebinds the global `model`.
model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True,
    random_state=42,
    verbose=True  # emits solver progress; the recorded output starts with "[LibSVM]"
)


model.fit(X_train, y_train)

[LibSVM]

In [110]:
# Candidate model: degree-2 polynomial feature expansion followed by a linear SVM.
# Rebinds the global `model`.
# NOTE(review): no feature-scaling step precedes LinearSVC in this pipeline - confirm
# that is intended.
model = make_pipeline(
    PolynomialFeatures(degree=2, interaction_only=False, include_bias=False),
    LinearSVC(
        class_weight='balanced',
        max_iter=10000,
        tol=1e-4,
        random_state=42
    )
)
model.fit(X_train, y_train)

In [117]:
# Candidate model: Gaussian naive Bayes with default settings.
# Rebinds the global `model`; by execution count (117) this is the model that was
# fitted immediately before the recorded evaluation cell (118).
model = GaussianNB()

model.fit(X_train, y_train)

In [118]:
def evaluate(X, y, set_name, estimator=None, show_preview=False):
    """Score a fitted classifier on one split, print the metrics and log them to CSV.

    Args:
        X: Feature matrix passed to ``estimator.predict``.
        y: Ground-truth labels the predictions are compared against.
        set_name: Split label (e.g. "train"); used in the printed summary and as
            the sub-directory of ``log/`` that receives ``metrics.csv``.
        estimator: Fitted object exposing ``predict``. When omitted, the
            notebook-global ``model`` is used, which keeps existing three-argument
            calls working unchanged.
        show_preview: When true, print the raw prediction array before the
            metrics. Replaces the former module-level flag that printed it on the
            first call only.

    Side effects:
        Prints one summary line, creates ``log/<set_name>/`` if necessary and
        overwrites ``log/<set_name>/metrics.csv``.
    """
    if estimator is None:
        # Fall back to whichever model cell was executed last.
        estimator = model

    y_pred = estimator.predict(X)
    if show_preview:
        print(y_pred)

    acc = accuracy_score(y, y_pred)
    pre = precision_score(y, y_pred)
    rec = recall_score(y, y_pred)

    print(f"{set_name} Accuracy: {acc:.3f}, Precision: {pre:.3f}, Recall: {rec:.3f}")

    subdir = os.path.join("log", set_name)
    os.makedirs(subdir, exist_ok=True)

    with open(os.path.join(subdir, "metrics.csv"), "w") as f:
        # No loss is computed here; the "loss" column is written as a constant 0.
        # NOTE(review): presumably kept so the schema matches other logs - confirm.
        f.write("loss,accuracy,precision,recall\n")
        f.write(f"0,{acc},{pre},{rec}\n")

# Preview the predictions once (on the train split), then report both splits.
evaluate(X_train, y_train, "train", show_preview=True)
evaluate(X_test, y_test, "test")

# Persist the model that was just evaluated (joblib is imported in the imports cell).
# NOTE(review): the file is named rf_model.pkl, but `model` is whatever model cell ran
# last - in the recorded run that is the GaussianNB cell, not the random forest.
# The path is left unchanged in case something else loads it; confirm and rename if not.
os.makedirs("checkpoint", exist_ok=True)
joblib.dump(model, "checkpoint/rf_model.pkl")

[ True  True False ... False  True  True]
train Accuracy: 0.765, Precision: 0.572, Recall: 0.752
test Accuracy: 0.765, Precision: 0.574, Recall: 0.754


['checkpoint/rf_model.pkl']