In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import cross_val_score

from sklearn.impute import KNNImputer
from pandas.api.types import CategoricalDtype
from geopy.geocoders import Nominatim
import re

import pandas as pd

from scipy import stats

In [2]:
from my_pipelines import *
from kacper_pipelines import *
from pipelines_miki import *

In [3]:
# Path to the raw CSV data set, relative to the notebook's working directory.
FILE_PATH = './weatherAUS.csv'

In [4]:
# Load the raw data set and preview its first rows.
australia_rain = pd.read_csv(FILE_PATH)
australia_rain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [12]:
class WholeRainClassifier(BaseEstimator, ClassifierMixin):
    """One classifier trained on the whole data set behind a configurable pipeline.

    ``fit`` assembles a scikit-learn ``Pipeline`` from the project's custom
    transformers according to the constructor flags, appends ``clf_class`` as
    the final step and fits the result. ``predict`` delegates to that pipeline.

    Parameters
    ----------
    clf_class : estimator instance
        Estimator used as the last pipeline step (an instance, not a class).
        NOTE(review): the default instance is created once, when the class is
        defined, and is therefore shared by every object built with the
        default argument.
    columns_na_threshold : number
        Value forwarded to ``DropColumnsAbovePercentNA`` (presumably the
        allowed percentage of missing values per column -- confirm).
    temp_daily_difference : bool
        Add the ``MaxMinTempDifference`` step.
    wind_daily_difference : bool
        Add the ``WindDailyDifference`` step.
    pressure_daily_difference : bool
        Add the ``PressureDailyDifference`` step.
    humidity_daily_difference : bool
        Add the ``HumidityDailyDifference`` step.
    latitude_longnitude : bool
        Add the ``MapLocation`` step (name spelling kept for compatibility).
    latitude_longitude_polynomial : int
        Second argument of ``PolynomialSubset`` applied to the ``longitude``
        and ``latitude`` columns (presumably the polynomial degree -- confirm).
        A falsy value, or ``latitude_longnitude=False``, skips the step.
    imputation : str
        ``"mean"`` adds ``MeanNANImputer``; any other value adds no
        imputation step.
    wind_transformation : str
        ``"wind_to_degrees"`` adds ``WindToDegrees``; ``"wind_to_binary"`` is
        not implemented; any other value drops the wind direction columns.
    date_features : bool
        ``True`` adds ``FeaturesFromDate(True)``; otherwise the ``Date``
        column is dropped.
    """

    def __init__(self, clf_class = RandomForestClassifier(n_jobs=-1),
                 columns_na_threshold = 12,
                 temp_daily_difference = True,
                 wind_daily_difference = True,
                 pressure_daily_difference = True,
                 humidity_daily_difference = True,
                 latitude_longnitude = True,
                 latitude_longitude_polynomial = 4,
                 imputation = "mean",
                 wind_transformation = "wind_to_degrees",
                 date_features = True
                ):
        # Store every argument unmodified so get_params/set_params/clone work.
        self.clf_class = clf_class
        # Previously hard-coded to 12, which silently ignored the argument.
        self.columns_na_threshold = columns_na_threshold
        self.temp_daily_difference = temp_daily_difference
        self.wind_daily_difference = wind_daily_difference
        self.pressure_daily_difference = pressure_daily_difference
        self.humidity_daily_difference = humidity_daily_difference
        self.latitude_longnitude = latitude_longnitude
        self.latitude_longitude_polynomial = latitude_longitude_polynomial
        self.imputation = imputation
        self.wind_transformation = wind_transformation
        self.date_features = date_features

    def fit(self, X, y):
        """Build the pipeline described by the constructor flags and fit it.

        Parameters
        ----------
        X : feature table handed to the pipeline (and to ``MapLocation``).
        y : target values handed to the pipeline.

        Returns
        -------
        self

        Raises
        ------
        NotImplementedError
            If ``wind_transformation == "wind_to_binary"``.
        """
        steps = []

        if self.latitude_longnitude:
            steps.append(("latitude_longnitude", MapLocation(X, normalize=False)))

        # Wind direction handling: convert, reject, or drop the columns.
        if self.wind_transformation == "wind_to_degrees":
            steps.append(("wind_to_degrees", WindToDegrees()))
        elif self.wind_transformation == "wind_to_binary":
            raise NotImplementedError("not implemented")
        else:
            steps.append(("drop_wind", DropColumns(["WindGustDir", "WindDir9am", "WindDir3pm"])))

        # Remove columns with too many missing values; the threshold is now
        # taken from the hyper-parameter instead of a literal 12.
        steps.append(("Drop columns with NAs", DropColumnsAbovePercentNA(self.columns_na_threshold)))

        # Normalization
        steps.append(("normalization", NormalizeContinuousFeatures(MinMaxScaler())))

        # Imputation (only the "mean" strategy exists)
        if self.imputation == "mean":
            steps.append(("NA Mean Imputer", MeanNANImputer()))

        # Optional engineered features
        if self.temp_daily_difference:
            steps.append(("temp_daily_difference", MaxMinTempDifference()))
        if self.wind_daily_difference:
            steps.append(("wind_daily_difference", WindDailyDifference()))
        if self.pressure_daily_difference:
            steps.append(("pressure_daily_difference", PressureDailyDifference()))
        if self.humidity_daily_difference:
            steps.append(("humidity_daily_difference", HumidityDailyDifference()))
        if self.latitude_longnitude and self.latitude_longitude_polynomial:
            steps.append(("latitude_longnitude_polynomial",
                          PolynomialSubset(['longitude', 'latitude'], self.latitude_longitude_polynomial)))

        if self.date_features:
            steps.append(("Date", FeaturesFromDate(True)))
        else:
            steps.append(("Drop Date", DropColumns(["Date"])))

        steps.append(("Drop Rest", DropColumns(["Location", 'Temp3pm', 'Temp9am', 'Humidity9am'])))

        # The classifier is always the final step.
        steps.append(("classifier", self.clf_class))

        self.pipeline = Pipeline(steps)
        self.pipeline.fit(X, y)
        return self

    def predict(self, X):
        """Return the fitted pipeline's predictions for ``X``."""
        return self.pipeline.predict(X)

In [13]:
class EachCityClassifier(BaseEstimator, ClassifierMixin):
    """Scaffold for a classifier that would hold one pipeline per city.

    Unfinished: ``fit`` only prepares intermediate objects and returns
    ``None``; ``predict`` does nothing and returns ``None``.
    """

    def __init__(self):
        # Meant to hold the per-city pipelines; nothing fills it yet.
        self.classifiers = {}

    def fit(self, australia_rain, y):
        """Prepare (but do not yet use) the shared steps and per-city frames.

        ``y`` is currently unused and nothing is stored on ``self``.
        """
        # Shared step list; the single step targets "RainTomorrow"
        # (presumably discarding entries without the class label -- confirm).
        shared_steps = [("drop_without_class", DropColumnsWithNAs("RainTomorrow"))]

        # One sub-frame per distinct value of the 'Location' column.
        frames_by_location = {
            location: frame
            for location, frame in australia_rain.groupby('Location')
        }

        return None

    def predict(self, X):
        """Not implemented yet; returns ``None``."""
        return None

In [14]:
# Reload the raw data and run the row-level preprocessing that happens
# outside the model pipeline. Each step rebinds `australia_rain`, so the
# order of the statements matters.
# NOTE(review): the imputer and the outlier remover are fitted on the full
# data set here, before the train/test split further down, so the held-out
# rows influence them -- consider fitting them on the training part only.
australia_rain = pd.read_csv(FILE_PATH)
print(australia_rain.shape)
# Project transformers; presumably they drop rows lacking the target and rows
# with too many missing values, then encode the rain columns as numbers --
# confirm against their definitions.
australia_rain = DropRowsWithNAInColums(["RainTomorrow"]).transform(australia_rain)
australia_rain = DropRowsWithMoreThanXNA(10).transform(australia_rain)
australia_rain = RainToNumerical().transform(australia_rain)
australia_rain = MeanNANImputer().fit_transform(australia_rain)
australia_rain = RemoveOutliers().fit_transform(australia_rain)

(145460, 23)
Removed: 4560


In [15]:
# Display the preprocessed frame.
australia_rain

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,5.468002,7.624739,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.000000,4.503452,16.9,21.8,0.0,0.0
1,2008-12-02,Albury,7.4,25.1,0.0,5.468002,7.624739,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,4.440081,4.503452,17.2,24.3,0.0,0.0
2,2008-12-03,Albury,12.9,25.7,0.0,5.468002,7.624739,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,4.440081,2.000000,21.0,23.2,0.0,0.0
3,2008-12-04,Albury,9.2,28.0,0.0,5.468002,7.624739,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,4.440081,4.503452,18.1,26.5,0.0,0.0
4,2008-12-05,Albury,17.5,32.3,1.0,5.468002,7.624739,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.000000,8.000000,17.8,29.7,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,2017-06-20,Uluru,3.5,21.8,0.0,5.468002,7.624739,E,31.0,ESE,...,59.0,27.0,1024.7,1021.2,4.440081,4.503452,9.4,20.9,0.0,0.0
145455,2017-06-21,Uluru,2.8,23.4,0.0,5.468002,7.624739,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,4.440081,4.503452,10.1,22.4,0.0,0.0
145456,2017-06-22,Uluru,3.6,25.3,0.0,5.468002,7.624739,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,4.440081,4.503452,10.9,24.5,0.0,0.0
145457,2017-06-23,Uluru,5.4,26.9,0.0,5.468002,7.624739,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,4.440081,4.503452,12.5,26.1,0.0,0.0


In [16]:
# Separate the integer-encoded target from the feature columns.
y = australia_rain["RainTomorrow"].astype(int)
X = australia_rain.drop("RainTomorrow", axis =1)
print(X.shape)
# Hold out 30% of the rows for evaluation. The seed is fixed so that the
# split, and therefore every score reported below, is reproducible after
# Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

(135838, 22)


In [17]:
# Baseline model: default settings, except that the Date column is dropped
# instead of being expanded into date features.
clf = WholeRainClassifier(date_features = False)

In [18]:
# Fit on the training part and evaluate on the held-out part.
# `score` is inherited from ClassifierMixin (mean accuracy).
clf.fit(X_train, y_train)
score=clf.score(X_test, y_test)

In [19]:
# Held-out score of the baseline model.
score

0.8629269729093051

In [None]:
# 10-fold cross-validated F1 of the baseline configuration on the full
# (already preprocessed) data set.
scores = cross_val_score(WholeRainClassifier(date_features = False), X, y, cv=10, scoring="f1")
scores

In [None]:
# Mean F1 across the cross-validation folds.
sum(scores)/len(scores)

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [None]:
# WholeRainClassifier parameters and their defaults (kept for reference):
#   columns_na_threshold = 12,
#   temp_daily_difference = True,
#   wind_daily_difference = True,
#   pressure_daily_difference = True,
#   humidity_daily_difference = True,
#   latitude_longnitude = True,
#   latitude_longitude_polynomial = 4,
#   imputation = "mean",
#   wind_transformation = "wind_to_degrees",
#   date_features = True
# Alternative value for clf_class:
#   XGBClassifier(objective='binary:logistic')

In [None]:
# Grid search over the final estimator. The candidates must be estimator
# *instances*: WholeRainClassifier.fit puts `clf_class` straight into the
# pipeline as its last step, so passing the bare classes would fail at fit.
gscv = GridSearchCV(estimator=WholeRainClassifier(),
             param_grid={'clf_class': [RandomForestClassifier(), MLPClassifier()]},
            n_jobs=-1, cv=2)

In [23]:
# Compare candidate final estimators on the held-out split and keep the best.
# NOTE(review): the recorded run of this cell failed while fitting
# XGBClassifier with a dtype ValueError naming the Week_Number and Year
# columns; presumably they come from FeaturesFromDate (date_features defaults
# to True) -- confirm, then either make those columns numeric or pass
# date_features=False here.
best_score = 0
best_clf = None
for clf_class in [RandomForestClassifier(n_jobs=-1), XGBClassifier(objective='binary:logistic')]:
    print(clf_class)
    clf = WholeRainClassifier(clf_class=clf_class)
    clf.fit(X_train,y_train)
    score = clf.score(X_test, y_test)
    if score > best_score:
        # Track the running maximum; without this update the last candidate
        # with a positive score always won.
        best_score = score
        best_clf = clf

RandomForestClassifier(n_jobs=-1)
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)




ValueError: DataFrame.dtypes for data must be int, float, bool or categorical.  When
                categorical type is supplied, DMatrix parameter
                `enable_categorical` must be set to `True`.Week_Number, Year