In [74]:
import pandas as pd
import numpy as np
import joblib

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [75]:
raw_df = pd.read_csv('weatherAUS.csv')
raw_df.dropna(subset=['RainToday', 'RainTomorrow'], inplace=True)

year = pd.to_datetime(raw_df.Date).dt.year

train_df = raw_df[year < 2015]
val_df = raw_df[year == 2015]
test_df = raw_df[year > 2015]

input_cols = list(train_df.columns)[1:-1]
target_col = 'RainTomorrow'

train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()

In [76]:
imputer = SimpleImputer(strategy = 'mean')
imputer.fit(train_inputs[numeric_cols])

train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])

In [77]:
min_values = train_inputs[numeric_cols].min()
max_values = train_inputs[numeric_cols].max()
mean_values = train_inputs[numeric_cols].mean()

In [78]:
scaler = MinMaxScaler()
scaler.fit(train_inputs[numeric_cols])

train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [79]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(train_inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

In [80]:
train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
  train_inputs[encoded_cols]

In [81]:
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

In [82]:
y_train = train_targets
y_val = val_targets
y_test = test_targets

In [83]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [84]:
model.fit(X_train, train_targets)

In [85]:
y_pred_val = model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred_val)
print(f"Точність на валідаційному датасеті: {accuracy:.2f}")

Точність на валідаційному датасеті: 0.86


In [86]:
y_pred_test = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_test)
print(f"Точність на тестовому датасеті: {accuracy:.2f}")

Точність на тестовому датасеті: 0.85


In [87]:
test_probs = model.predict_proba(X_test)

In [88]:
test_probs

array([[0.75, 0.25],
       [0.68, 0.32],
       [0.67, 0.33],
       ...,
       [0.98, 0.02],
       [1.  , 0.  ],
       [0.99, 0.01]])

In [89]:
aussie_rain = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'encoded_cols': encoded_cols,
    'min_values': min_values,
    'max_values': max_values,
    'mean_values': mean_values
}

In [90]:
joblib.dump(aussie_rain, "rf_model.joblib")

['rf_model.joblib']