In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

: 

In [None]:
# Load weatherAUS.csv into a DataFrame and preview its first five rows.
rain = pd.read_csv(filepath_or_buffer="weatherAUS.csv")

rain.head(n=5)

: 

In [None]:
# Show the frame's dimensions as a (rows, columns) tuple.
rain.shape

: 

In [None]:
# Print a per-column summary of the frame: names, non-null counts and dtypes.
rain.info()

: 

## Work only on today's data

In [None]:
# Columns excluded from the "today only" analysis.
# NOTE(review): Rainfall is presumably dropped because it would reveal the
# RainToday label — confirm against the dataset description.
cols_to_drop = ["Date", "Location", "RainTomorrow", "Rainfall"]

rain = rain.drop(columns=cols_to_drop)

: 

In [None]:
# Fraction of missing values in each column (mean of the per-cell null mask).
missing_props = rain.isnull().mean()
missing_props

: 

In [None]:
# Keep only the columns in which at least 40% of the values are missing.
over_threshold = missing_props.loc[lambda props: props >= 0.4]
over_threshold

: 

In [None]:
# Remove every column flagged in `over_threshold` (its index holds the names).
rain = rain.drop(columns=over_threshold.index)

: 

In [None]:
# Separate the RainToday target from the remaining feature columns.
y = rain["RainToday"]
X = rain.drop(columns=["RainToday"])

: 

## Sklearn

In [None]:
# Preprocessing for categorical columns: fill missing values with the most
# frequent category, then one-hot encode into a dense array (categories not
# seen during fit are encoded as all zeros).
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the new keyword first and fall back to
# the old one on older releases.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only knows `sparse`
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", one_hot_encoder),
    ]
)

: 

In [None]:
# Preprocessing for numeric columns: replace missing values with the column
# mean, then standardise each column with StandardScaler.
numeric_pipeline = Pipeline(
    [
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
)

: 

In [None]:
# Partition the feature columns by dtype: numeric ones versus everything else.
num_cols = X.select_dtypes(include=["number"]).columns
cat_cols = X.select_dtypes(exclude=["number"]).columns

: 

In [None]:
# Route each column group through its own pipeline; the numeric pipeline is
# listed first, followed by the categorical one.
full_processor = ColumnTransformer(
    [
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
)

: 

In [None]:
import xgboost as xgb

: 

In [None]:
# Create an XGBoost classifier; no hyperparameters are passed, so library defaults apply.
xgb_cl = xgb.XGBClassifier()

: 

In [None]:
# Print the class of the estimator object created above.
print(type(xgb_cl))

: 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Prepare the target: fill missing labels with the most frequent class, flatten
# the (n, 1) imputer output to 1-D, and encode the classes as integers
# 0..n_classes-1 (newer XGBoost releases reject non-integer class labels).
# `label_encoder.inverse_transform` maps predictions back to the original labels.
# NOTE(review): imputing a *target* invents labels; dropping rows with a missing
# RainToday may be preferable — confirm with the analysis owner.
y_imputed = SimpleImputer(strategy="most_frequent").fit_transform(
    y.values.reshape(-1, 1)
)
label_encoder = LabelEncoder()
y_processed = label_encoder.fit_transform(y_imputed.ravel())

# Split the raw features first so that the preprocessing statistics (imputation
# values, scaling parameters, one-hot categories) are learned from the training
# rows only; fitting on all rows before splitting leaks test information.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_processed, stratify=y_processed, random_state=1121218
)

# Fit the preprocessor on the training split, then apply it unchanged to test.
X_train = full_processor.fit_transform(X_train_raw)
X_test = full_processor.transform(X_test_raw)

: 

In [None]:
from sklearn.metrics import accuracy_score

# Build a default-configured classifier and train it on the training split
# (fit returns the estimator itself, so creation and training are chained).
xgb_cl = xgb.XGBClassifier().fit(X_train, y_train)

# Predicted labels for the held-out rows.
preds = xgb_cl.predict(X_test)


: 

In [None]:
# Share of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=preds)

: 

: 