In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import warnings

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV

from scipy.sparse import csr_matrix as csr
from scipy.sparse import hstack

In [2]:
df = pd.read_csv('weatherAUS.csv')

In [3]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [4]:
df.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,143975.0,144199.0,142199.0,82670.0,75625.0,135197.0,143693.0,142398.0,142806.0,140953.0,130395.0,130432.0,89572.0,86102.0,143693.0,141851.0
mean,12.194034,23.221348,2.360918,5.468232,7.611178,40.03523,14.043426,18.662657,68.880831,51.539116,1017.64994,1015.255889,4.447461,4.50993,16.990631,21.68339
std,6.398495,7.119049,8.47806,4.193704,3.785483,13.607062,8.915375,8.8098,19.029164,20.795902,7.10653,7.037414,2.887159,2.720357,6.488753,6.93665
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4
25%,7.6,17.9,0.0,2.6,4.8,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6
50%,12.0,22.6,0.0,4.8,8.4,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1
75%,16.9,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7


# Dataset Preparation

In [5]:
def season_finder(x):
    y = dt.strptime(x, "%Y-%m-%d").month
    if y in [12, 1, 2]:
        return 'summer', y
    elif y in [3, 4, 5]:
        return 'autumn', y
    elif y in [6, 7, 8]:
        return 'winter', y
    else:
        return 'spring', y


df['Season'], df['month'] = zip(*df.Date.apply(season_finder))

In [6]:
new_df = pd.DataFrame(columns=df.columns.tolist())

for month in df.month.unique().tolist():
    new_df = pd.concat([new_df, df[df.month == month].fillna(df[df.month == month].mean())])

In [7]:
new_df.isna().sum()

Date                 0
Location             0
MinTemp              0
MaxTemp              0
Rainfall             0
Evaporation          0
Sunshine             0
WindGustDir      10326
WindGustSpeed        0
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am         0
WindSpeed3pm         0
Humidity9am          0
Humidity3pm          0
Pressure9am          0
Pressure3pm          0
Cloud9am             0
Cloud3pm             0
Temp9am              0
Temp3pm              0
RainToday         3261
RainTomorrow      3267
Season               0
month                0
dtype: int64

In [8]:
new_df.dropna(subset=['RainTomorrow', 'RainToday'], inplace=True)

new_df.sort_values(by='Date', ignore_index=True, inplace=True)

for na_col in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    new_df[na_col].fillna(method='ffill', inplace=True)
    
new_df.set_index('Date', inplace=True)

In [9]:
new_df.isna().sum()

Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
Season           0
month            0
dtype: int64

In [10]:
new_df.replace(['Yes', 'No'], [1, 0], inplace=True)

In [11]:
new_df.head()

Unnamed: 0_level_0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Season,month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2007-11-01,Canberra,8.0,24.3,0.0,3.4,6.3,NW,30.0,SW,NW,6.0,20.0,68.0,29.0,1019.7,1015.0,7.0,7.0,14.4,23.6,0,1,spring,11
2007-11-02,Canberra,14.0,26.9,3.6,4.4,9.7,ENE,39.0,E,W,4.0,17.0,80.0,36.0,1012.4,1008.4,5.0,3.0,17.5,25.7,1,1,spring,11
2007-11-03,Canberra,13.7,23.4,3.6,5.8,3.3,NW,85.0,N,NNE,6.0,6.0,82.0,69.0,1009.5,1007.2,8.0,7.0,15.4,20.2,1,1,spring,11
2007-11-04,Canberra,13.3,15.5,39.8,7.2,9.1,NW,54.0,WNW,W,30.0,24.0,62.0,56.0,1005.5,1007.0,2.0,7.0,13.5,14.1,1,1,spring,11
2007-11-05,Canberra,7.6,16.1,2.8,5.6,10.6,SSE,50.0,SSE,ESE,20.0,28.0,68.0,49.0,1018.3,1018.5,7.0,7.0,11.1,15.4,1,0,spring,11


In [12]:
ohe_cols = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'Season', 'month']

prep_df = new_df[[x for x in new_df.columns if x not in ohe_cols]].copy()

ohe_df = new_df[ohe_cols].copy()

In [13]:
ohe = OHE()

ohe_prep = ohe.fit_transform(ohe_df)

In [14]:
target = new_df["RainTomorrow"]
features = hstack([csr(prep_df.drop("RainTomorrow", axis = 1).values), ohe_prep])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

# Random Forest

In [16]:
rfc = RFC(n_estimators=100, n_jobs=-1)

rfc.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1)

In [17]:
y_pred = rfc.predict(X_test)

In [18]:
rf_metrics = pd.DataFrame(data={'metric': ['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
                             'score':[metrics.accuracy_score(y_test, y_pred),
                                      metrics.f1_score(y_test, y_pred),
                                      metrics.precision_score(y_test, y_pred),
                                      metrics.recall_score(y_test, y_pred),
                                      metrics.roc_auc_score(y_test, y_pred)]})

In [19]:
rf_metrics

Unnamed: 0,metric,score
0,accuracy,0.857687
1,f1,0.611736
2,precision,0.78731
3,recall,0.50019
4,roc_auc,0.730577


# Logistic Regression + GridSearch

In [20]:
parameters = {'penalty':('l1', 'l2', 'elasticnet'), 'C':[0.1, 1.0, 10.0, 100.0]}

clf = GridSearchCV(LR(), parameters, n_jobs=-1, scoring='accuracy')
clf.fit(X_train, y_train)

clf.best_params_

{'C': 0.1, 'penalty': 'l2'}

In [21]:
lr = LR(C=0.1, n_jobs=-1)
lr.fit(X_train, y_train)

LogisticRegression(C=0.1, n_jobs=-1)

In [22]:
vfunc = np.vectorize(lambda x: 1 if x > 0.35 else 0)

lr_y_pred = vfunc(lr.predict_proba(X_test)[:, 1])

In [23]:
lr_metrics = pd.DataFrame(data={'metric': ['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
                             'score':[metrics.accuracy_score(y_test, lr_y_pred),
                                      metrics.f1_score(y_test, lr_y_pred),
                                      metrics.precision_score(y_test, lr_y_pred),
                                      metrics.recall_score(y_test, lr_y_pred),
                                      metrics.roc_auc_score(y_test, lr_y_pred)]})

In [24]:
lr_metrics

Unnamed: 0,metric,score
0,accuracy,0.827002
1,f1,0.614742
2,precision,0.613694
3,recall,0.615794
4,roc_auc,0.751906


# AutoML

In [25]:
import torch

from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
from lightautoml.automl.blend import WeightedBlender

In [26]:
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 42

In [27]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [28]:
task = Task('binary')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

In [29]:
model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'seed': 42, 'num_threads': N_THREADS}
)
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


In [30]:
pipe = LGBSimpleFeatures()

params_tuner1 = OptunaTuner(n_trials=20, timeout=60)

model1 = BoostLGBM(default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': 1, 'num_threads': N_THREADS})

model2 = BoostLGBM(default_params={'learning_rate': 0.025, 'num_leaves': 64, 'seed': 2, 'num_threads': N_THREADS})

pipeline_lvl1 = MLPipeline([(model1, params_tuner1),model2],
                           pre_selection=selector,
                           features_pipeline=pipe,
                           post_selection=None)

In [31]:
pipe1 = LGBSimpleFeatures()

model = BoostLGBM(default_params={'learning_rate': 0.05,
                                  'num_leaves': 64,
                                  'max_bin': 1024,
                                  'seed': 3,
                                  'num_threads': N_THREADS},
                  freeze_defaults=True)

pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)

In [32]:
automl = AutoML(reader, [[pipeline_lvl1], [pipeline_lvl2]], skip_conn=False)

In [33]:
# turning our sparce matrix back to data frame

n, m = csr(y_train).get_shape()

train_data = hstack([X_train, csr(y_train).reshape(m, n)])

train_df = pd.DataFrame(data=train_data.toarray())

train_df.rename(columns={x:str(x) for x in train_df.columns if x != 130}, inplace=True)
train_df.rename(columns={130: 'target'}, inplace=True)

In [34]:
autopred = automl.fit_predict(train_df, roles={'target': 'target'})



Layer 1 ...
Train process start. Time left 9999999970.360048 secs
Time left 9999999757.012081
Layer 1 training completed.


Layer 2 ...
Train process start. Time left 9999999757.003166 secs
Time left 9999999752.44733


In [35]:
# turning our sparce matrix back to data frame

test_df = pd.DataFrame(data=X_test.toarray())

test_df.rename(columns={x:str(x) for x in test_df.columns}, inplace=True)

In [36]:
pred = automl.predict(test_df)

In [37]:
automl_pred = vfunc(pred.data[:, 0])

In [38]:
automl_metrics = pd.DataFrame(data={'metric': ['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
                             'score':[metrics.accuracy_score(y_test, automl_pred),
                                      metrics.f1_score(y_test, automl_pred),
                                      metrics.precision_score(y_test, automl_pred),
                                      metrics.recall_score(y_test, automl_pred),
                                      metrics.roc_auc_score(y_test, automl_pred)]})

In [39]:
automl_metrics

Unnamed: 0,metric,score
0,accuracy,0.86121
1,f1,0.675178
2,precision,0.71007
3,recall,0.643554
4,roc_auc,0.783821
