In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import cross_val_score

from sklearn.impute import KNNImputer
from pandas.api.types import CategoricalDtype
from geopy.geocoders import Nominatim
import re

import pandas as pd

from scipy import stats

from imblearn.over_sampling import SMOTE
import itertools

from sklearn.metrics import accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression

In [2]:
from my_pipelines import *
from kacper_pipelines import *
from pipelines_miki import *

In [3]:
FILE_PATH = './weatherAUS.csv'

In [4]:
australia_rain = pd.read_csv(FILE_PATH)
australia_rain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [5]:
class WholeRainClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, clf_class = RandomForestClassifier(n_jobs=-1),
                 columns_na_threshold = 12,
                 temp_daily_difference = True,
                 wind_daily_difference = True,
                 pressure_daily_difference = True,
                 humidity_daily_difference = True,
                 latitude_longnitude = True,
                 imputation = "mean",
                 wind_transformation = "wind_to_degrees",
                 date_features = True
                ):
        self.clf_class = clf_class
        self.columns_na_threshold = 12
        self.temp_daily_difference = temp_daily_difference
        self.wind_daily_difference = wind_daily_difference
        self.pressure_daily_difference = pressure_daily_difference
        self.humidity_daily_difference = humidity_daily_difference
        self.latitude_longnitude = latitude_longnitude
        self.latitude_longitude_polynomial = 4
        self.imputation = imputation
        self.wind_transformation = wind_transformation
        self.date_features = date_features
    def fit(self, X, y):
        pipeline_candidates = []
        
#         if self.latitude_longnitude:
#             pipeline_candidates.append(("latitude_longnitude", MapLocation(X, normalize=False)))
            
#         if self.wind_transformation == "wind_to_degrees":
#             pipeline_candidates.append(("wind_to_degrees",WindToDegrees()))
#         elif self.wind_transformation == "wind_to_binary":
#             raise Exception("not implemented")
#         else:
#             pipeline_candidates.append(("drop_wind", DropColumns(["WindGustDir", "WindDir9am", "WindDir3pm"])))
        #remove bad columns
        #pipeline_candidates.append(("Drop columns with NAs", DropColumnsAbovePercentNA(0)))
        
        #normalization
        pipeline_candidates.append(("normalization", NormalizeContinuousFeatures(MinMaxScaler())))
        
        if self.imputation == "mean":
            #IMupte
            pipeline_candidates.append(("NA Mean Imputer", MeanNANImputer()))
                
        #custom features
        if self.temp_daily_difference:
            pipeline_candidates.append(("temp_daily_difference", MaxMinTempDifference()))
        if self.wind_daily_difference:
            pipeline_candidates.append(("wind_daily_difference", WindDailyDifference()))
        if self.pressure_daily_difference:
            pipeline_candidates.append(("pressure_daily_difference", PressureDailyDifference()))
        if self.humidity_daily_difference:
            pipeline_candidates.append(("humidity_daily_difference", HumidityDailyDifference()))
        if self.latitude_longnitude and self.latitude_longitude_polynomial:
            pipeline_candidates.append(("latitude_longnitude_polynomial", PolynomialSubset(['longitude', 'latitude'], self.latitude_longitude_polynomial)))
        
#         if self.date_features:
#             pipeline_candidates.append(("Date", FeaturesFromDate(True)))
#         else:
#             pipeline_candidates.append(("Drop Date", DropColumns(["Date"])))
        
        
        pipeline_candidates.append(("Drop Rest", DropColumns(['Temp3pm','Temp9am','Humidity9am'])))
        #add classifier
        pipeline_candidates.append(("classifier", self.clf_class))
        
        self.pipeline = Pipeline(pipeline_candidates)
        self.pipeline.fit(X,y)
        return self
    def predict(self, X):
        return self.pipeline.predict(X)

In [6]:
class EachCityClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.classifiers = dict()# dict of pipelines
    def fit(self, australia_rain, y):
        main_pipeline_candidates = []
        #drop without RainTommorrow
        main_pipeline_candidates.append(("drop_without_class", DropColumnsWithNAs("RainTomorrow")))
        
        australia_rain_by_city = {k:v for k, v in australia_rain.groupby('Location')}
        
        pass
    def predict(self, X):
        pass

In [7]:
australia_rain = pd.read_csv(FILE_PATH)
australia_rain = australia_rain.dropna()
print(australia_rain.shape)
australia_rain = DropRowsWithNAInColums(["RainTomorrow"]).transform(australia_rain)
australia_rain = DropRowsWithMoreThanXNA(10).transform(australia_rain)
australia_rain = RainToNumerical().transform(australia_rain)
# australia_rain = MeanNANImputer().fit_transform(australia_rain)
# australia_rain = RemoveOutliers().fit_transform(australia_rain)

(56420, 23)


In [8]:
#Mode
for col in ['RainToday']:
    australia_rain[col]=australia_rain[col].fillna(australia_rain[col].mode()[0])

In [9]:
for col in ['RainToday']:
    australia_rain[col]=australia_rain[col].fillna(australia_rain[col].mode()[0])

In [10]:
australia_rain = WindToDegrees().fit_transform(australia_rain)

In [11]:
australia_rain = MeanNANImputer(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am','Humidity3pm','Pressure9am','Pressure3pm','Cloud9am','Cloud3pm','Temp9am','Temp3pm', 'WindDir9am', 'WindDir3pm', 'WindGustDir']).fit_transform(australia_rain)

In [12]:
australia_rain = RemoveOutliers().fit_transform(australia_rain)

Removed: 1841


In [13]:
australia_rain = FeaturesFromDate(True).fit_transform(australia_rain)

In [14]:
australia_rain = MapLocation(australia_rain, normalize=False).fit_transform(australia_rain)

In [15]:
australia_rain = DropColumns("Location").fit_transform(australia_rain)

In [16]:
australia_rain.isna().sum()/australia_rain.shape[0] * 100

MinTemp          0.0
latitude         0.0
longitude        0.0
MaxTemp          0.0
Rainfall         0.0
Evaporation      0.0
Sunshine         0.0
WindGustDir      0.0
WindGustSpeed    0.0
WindDir9am       0.0
WindDir3pm       0.0
WindSpeed9am     0.0
WindSpeed3pm     0.0
Humidity9am      0.0
Humidity3pm      0.0
Pressure9am      0.0
Pressure3pm      0.0
Cloud9am         0.0
Cloud3pm         0.0
Temp9am          0.0
Temp3pm          0.0
RainToday        0.0
RainTomorrow     0.0
Week_Number      0.0
Year             0.0
dtype: float64

In [17]:
y = australia_rain["RainTomorrow"].astype(int)
X = australia_rain.drop("RainTomorrow", axis =1)
os = SMOTE()
X, y = os.fit_resample(X,y)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

(86266, 24)


In [18]:
clf = WholeRainClassifier()

In [19]:
clf.fit(X_train, y_train)
score=clf.score(X_test, y_test)

In [20]:
score

0.9126738794435858

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [22]:
gscv = GridSearchCV(estimator=WholeRainClassifier(),
             param_grid={'clf_class': [RandomForestClassifier,MLPClassifier]},
            n_jobs=-1, cv=2)

In [23]:
best_score = 0
best_clf = 0
clfs = [RandomForestClassifier(n_jobs=-1),RandomForestClassifier(n_jobs=-1, n_estimators=500), XGBClassifier(objective='binary:logistic'), DecisionTreeClassifier(), LogisticRegression(n_jobs=-1)]
columns_na_threshold = [12]
temp_daily_difference = [True, False]
wind_daily_difference = [False]
pressure_daily_difference = [False]
humidity_daily_difference = [False, True]
latitude_longnitude = [True, False]
imputation = ["mean"]
wind_transformation = ["wind_to_degrees", None]
date_features = [True]
combinations = itertools.product(clfs,
                                 columns_na_threshold,
                                 temp_daily_difference,
                                 wind_daily_difference,
                                 pressure_daily_difference,
                                 humidity_daily_difference,
                                 latitude_longnitude,
                                 imputation,
                                 wind_transformation,
                                 date_features
                                )
for comb in list(combinations):
    print(comb)
    clf = WholeRainClassifier(*comb)

    clf.fit(X_train,y_train)
    predicted = clf.predict(X_test)
    
    score = f1_score(y_test, predicted)
#     score = cross_val_score(clf, X, y, scoring="f1", cv = 5, n_jobs=-1).mean()
    print(score)
    if score > best_score:
        best_clf = clf
        best_score = score

(RandomForestClassifier(n_jobs=-1), 12, True, False, False, False, True, 'mean', 'wind_to_degrees', True)
0.9122150789012273
(RandomForestClassifier(n_jobs=-1), 12, True, False, False, False, True, 'mean', None, True)
0.9097926087634494
(RandomForestClassifier(n_jobs=-1), 12, True, False, False, False, False, 'mean', 'wind_to_degrees', True)
0.9089851475549696
(RandomForestClassifier(n_jobs=-1), 12, True, False, False, False, False, 'mean', None, True)
0.908266480641786
(RandomForestClassifier(n_jobs=-1), 12, True, False, False, True, True, 'mean', 'wind_to_degrees', True)
0.9107629958693789
(RandomForestClassifier(n_jobs=-1), 12, True, False, False, True, True, 'mean', None, True)
0.9118588492466235
(RandomForestClassifier(n_jobs=-1), 12, True, False, False, True, False, 'mean', 'wind_to_degrees', True)
0.9088234155406714
(RandomForestClassifier(n_jobs=-1), 12, True, False, False, True, False, 'mean', None, True)
0.9089781361451388
(RandomForestClassifier(n_jobs=-1), 12, False, False,



0.9127231964128434
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, True, False, False, False, True, 'mean', None, True)




0.9127231964128434
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, True, False, False, False, False, 'mean', 'wind_to_degrees', True)




0.9125871324413107
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, True, False, False, False, False, 'mean', None, True)




0.9125871324413107
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, True, False, False, True, True, 'mean', 'wind_to_degrees', True)




0.9125779127377337
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, True, False, False, True, True, 'mean', None, True)




0.9125779127377337
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, True, False, False, True, False, 'mean', 'wind_to_degrees', True)




0.9135318808714771
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, True, False, False, True, False, 'mean', None, True)




0.9135318808714771
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, False, False, False, False, True, 'mean', 'wind_to_degrees', True)




0.9140753191659663
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, False, False, False, False, True, 'mean', None, True)




0.9140753191659663
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, False, False, False, False, False, 'mean', 'wind_to_degrees', True)




0.9132906325060048
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, False, False, False, False, False, 'mean', None, True)




0.9132906325060048
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, False, False, False, True, True, 'mean', 'wind_to_degrees', True)




0.9133807886107334
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, False, False, False, True, True, 'mean', None, True)




0.9133807886107334
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, False, False, False, True, False, 'mean', 'wind_to_degrees', True)




0.9144523914172693
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None), 12, False, False, False, True, False, 'mean', None, True)




0.9144523914172693
(DecisionTreeClassifier(), 12, True, False, False, False, True, 'mean', 'wind_to_degrees', True)
0.8444564671329752
(DecisionTreeClassifier(), 12, True, False, False, False, True, 'mean', None, True)
0.8462967262595775
(DecisionTreeClassifier(), 12, True, False, False, False, False, 'mean', 'wind_to_degrees', True)
0.8452385566450913
(DecisionTreeClassifier(), 12, True, False, False, False, False, 'mean', None, True)
0.8464997286611365
(DecisionTreeClassifier(), 12, True, False, False, True, True, 'mean', 'wind_to_degrees', True)
0.8444153039185437
(DecisionTreeClassifier(), 12, True, False, False, True, True, 'mean', None, True)
0.8432982537296172
(DecisionTreeClassifier(), 12, True, False, False, True, False, 'mean', 'wind_to_degrees', True)
0.8424397426954972
(DecisionTreeClassifier(), 12, True, False, False, True, False, 'mean', None, True)
0.8444875024220113
(DecisionTreeClassifier(), 12, False, False, False, False, True, 'mean', 'wind_to_degrees', True)
0.84587

In [24]:
best_score

0.9144523914172693

In [25]:
best_clf.get_params(deep=False)

{'clf_class': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.300000012, max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', validate_parameters=1, verbosity=None),
 'columns_na_threshold': 12,
 'date_features': True,
 'humidity_daily_difference': True,
 'imputation': 'mean',
 'latitude_longnitude': False,
 'pressure_daily_difference': False,
 'temp_daily_difference': False,
 'wind_daily_difference': False,
 'wind_transformation': 'wind_to_degrees'}