# Краткое описание данных:

    Date - Дата наблюдений
    Location - Название локации, в которой расположена метеорологическая станция
    MinTemp - Минимальная температура в градусах Цельсия
    MaxTemp - Максимальная температура в градусах Цельсия
    Rainfall - Количество осадков, зафиксированных за день в мм
    Evaporation - Так называемое "pan evaporation" класса А (мм) за 24 часа до 9 утра
    Sunshine - Число солнечных часов за день
    WindGustDir - Направление самого сильного порыва ветра за последние 24 часа
    WindGustSpeed - Скорость (км/ч) самого сильного порыва ветра за последние 24 часа
    WindDir9am - Направление ветра в 9 утра

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw observations; the path is relative, so the CSV has to sit in
# the notebook's working directory.
csv_path = 'weatherAUS.csv'
df = pd.read_csv(csv_path)

# List every column name, then preview the first rows (5 is the head() default).
print(df.columns)
df.head(n=5)

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [3]:
# Replace every missing value with 0 so the downstream estimators receive no NaN.
# NOTE(review): this is a blanket fill. The describe() output shows the effect on
# the pressure columns (min 0.0, mean ~917 while the quartiles are ~1010), and any
# missing values in the text columns become a literal 0 category. Consider
# per-column imputation; left unchanged here to keep the recorded results.
df = df.fillna(0)

# Encode the Yes/No flags as integers: 1 for 'Yes', 0 for anything else
# (including the zeros written by the fill above).
df['RainTomorrow'] = df['RainTomorrow'].eq('Yes').mul(1)
df['RainToday'] = df['RainToday'].eq('Yes').mul(1)

# Reduce the date to its month number (1-12). The month is read directly from the
# parsed timestamps; formatting to '%y-%b-%d' text and re-parsing it would depend
# on the locale's month abbreviations and on guessing which 2-digit field is the
# year.
df['Date'] = pd.to_datetime(df['Date']).dt.month
df.describe()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
count,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0
mean,6.402544,12.131807,23.174186,2.326738,3.12934,3.988338,37.377592,13.869248,18.292855,67.984915,50.175564,917.357684,915.433845,2.762801,2.695034,16.87951,21.271466,0.221213,2.360682,0.224181
std,3.426506,6.440548,7.194768,8.426426,4.166674,4.688665,16.433198,8.954477,9.07587,20.416069,22.071424,303.403253,302.370443,3.132999,3.050051,6.611499,7.486759,0.415065,8.477969,0.417043
min,1.0,-8.5,-4.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.2,-5.4,0.0,0.0,0.0
25%,3.0,7.5,17.9,0.0,0.0,0.0,30.0,7.0,11.0,56.0,35.0,1011.0,1008.5,0.0,0.0,12.2,16.3,0.0,0.0,0.0
50%,6.0,12.0,22.6,0.0,1.6,0.2,37.0,13.0,17.0,70.0,51.0,1016.7,1014.2,1.0,1.0,16.6,20.9,0.0,0.0,0.0
75%,9.0,16.8,28.2,0.6,5.4,8.7,46.0,19.0,24.0,83.0,65.0,1021.8,1019.4,6.0,6.0,21.5,26.3,0.0,0.8,0.0
max,12.0,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,1.0,371.0,1.0


In [4]:
# Frequency table of the distinct RISK_MM values (value_counts sorts by count,
# largest first, by default).
risk_mm = df['RISK_MM']
risk_mm.value_counts()

0.0      91077
0.2       8762
0.4       3781
0.6       2591
0.8       2055
         ...  
106.0        1
170.4        1
90.0         1
172.2        1
74.4         1
Name: RISK_MM, Length: 681, dtype: int64

In [5]:
# Remove RISK_MM from the feature table.
# NOTE(review): presumably dropped because it is derived from next-day rainfall
# and would leak the RainTomorrow target - confirm against the dataset docs.
df = df.drop(columns=['RISK_MM'])

In [6]:
# Separate the predictors from the RainTomorrow target.
features = df.drop(columns=['RainTomorrow'])
target = df['RainTomorrow']

# Ordered (non-shuffled) 75/25 split: the first rows train, the trailing rows test.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,  # inert while shuffle=False; only matters if shuffling is enabled
    shuffle=False,
)
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((106644, 22), (35549, 22), (106644,), (35549,))

In [7]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that returns ``X[key]``.

    ``key`` is handed to the indexing operator unchanged, so it may be a
    single column label or a list of labels (both forms are used in this
    notebook).
    """

    def __init__(self, key):
        # Stored as-is so scikit-learn's get_params/set_params can see it.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return the part of ``X`` addressed by ``self.key``."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that returns ``X[[key]]``.

    The key is wrapped in a list before indexing, so indexing a pandas
    DataFrame yields a one-column, two-dimensional result.
    """

    def __init__(self, key):
        # Stored as-is so scikit-learn's get_params/set_params can see it.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return the single column ``self.key`` via list-based indexing."""
        wanted = [self.key]
        return X[wanted]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder based on ``pd.get_dummies`` with a fixed output layout.

    ``fit`` records the dummy column names produced for the fitting data;
    ``transform`` always returns exactly those columns, in that order.
    """

    def __init__(self, key):
        # `key` is used as the prefix of every generated dummy column.
        self.key = key
        # Filled in by fit(): the dummy column names seen on the fitting data.
        self.columns = []

    def fit(self, X, y=None):
        """Remember which dummy columns the fitting data produces."""
        fitted = pd.get_dummies(X, prefix=self.key)
        self.columns = list(fitted.columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns recorded by ``fit``.

        Columns recorded at fit time but absent from this encoding are added
        as all-zero columns; columns not recorded at fit time are dropped by
        the final selection.
        """
        encoded = pd.get_dummies(X, prefix=self.key)
        absent = [name for name in self.columns if name not in encoded.columns]
        for name in absent:
            encoded[name] = np.zeros(len(encoded))
        return encoded[self.columns]

In [8]:
# Station name: pick the 'Location' column, then one-hot encode it.
location_steps = [
    ('selector', ColumnSelector(key='Location')),
    ('Location', OHEEncoder(key='Location')),
]
location = Pipeline(steps=location_steps)

In [9]:
def _ohe_pipeline(column):
    """Build a two-step pipeline: select ``column``, then one-hot encode it."""
    return Pipeline([
        ('selector', ColumnSelector(key=column)),
        ('ohe', OHEEncoder(key=column)),
    ])


# One independent encoder pipeline per wind-direction column.
WindGustDir = _ohe_pipeline('WindGustDir')
WindDir9am = _ohe_pipeline('WindDir9am')
WindDir3pm = _ohe_pipeline('WindDir3pm')

In [10]:
def _scaled_pipeline(column):
    """Build a two-step pipeline: select ``column`` as 2-D, then standardize it."""
    return Pipeline([
        ('selector', NumberSelector(key=column)),
        ('standard', StandardScaler()),
    ])


# One independent scaler pipeline per numeric column.
MinTemp = _scaled_pipeline('MinTemp')
MaxTemp = _scaled_pipeline('MaxTemp')
Temp9am = _scaled_pipeline('Temp9am')
Temp3pm = _scaled_pipeline('Temp3pm')
Evaporation = _scaled_pipeline('Evaporation')

In [11]:
# Columns forwarded as they are (no scaling, no encoding): the RainToday flag
# and the Date column.
passthrough_columns = ['RainToday', 'Date']
number_features = Pipeline([
    ('selector', ColumnSelector(key=passthrough_columns)),
])

In [12]:
# Feature blocks that FeatureUnion runs and concatenates side by side.
# Commented-out entries are candidate features that are currently switched off.
active_transformers = [
    # ('Location', location),
    # ('WindGustDir', WindGustDir),
    ('Evaporation', Evaporation),
    ('WindDir3pm', WindDir3pm),
    ('MinTemp', MinTemp),
    ('MaxTemp', MaxTemp),
    # ('Temp9am', Temp9am),
    # ('Temp3pm', Temp3pm),
    ('continuos_features', number_features),
]
feats = FeatureUnion(active_transformers)

# Feature extraction wrapped as a one-step pipeline of its own.
feature_processing = Pipeline(steps=[('feats', feats)])

In [13]:
# Candidate models, each evaluated on the same feature union and the same split.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=2),
    BernoulliNB(),
    GaussianNB(),
]

for classifier in classifiers:
    # The shared `feats` object is refit on the training data in every iteration.
    pipeline = Pipeline(steps=[('features', feats), ('classifier', classifier)])

    # fit() returns the pipeline itself, so prediction can be chained onto it.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    print(f'classifier: {classifier} accuracy_score : {accuracy_score(y_test, y_pred)}')

classifier: LogisticRegression() accuracy_score : 0.808574080846156
classifier: KNeighborsClassifier(n_neighbors=2) accuracy_score : 0.7779403077442404
classifier: BernoulliNB() accuracy_score : 0.7895299445835325
classifier: GaussianNB() accuracy_score : 0.7430870066668542
