In [1]:
import numpy as np
import pandas as pd
import datetime    
import torch
import torch.nn as nn
import torch.utils.data.dataloader
from neural_net import P1_Net, do_train

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.utils.validation import column_or_1d

In [2]:
df_city = pd.read_csv("city_attributes.csv")
df_humidity = pd.read_csv("humidity.csv")
df_pressure = pd.read_csv("pressure.csv")
df_temper = pd.read_csv("temperature.csv")
df_descript = pd.read_csv("weather_description.csv")
df_direction = pd.read_csv("wind_direction.csv")
df_speed = pd.read_csv("wind_speed.csv")

In [3]:
df_temper.describe(include='all')

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
count,45253,44458.0,45252.0,44460.0,45250.0,45250.0,45252.0,45252.0,45250.0,45252.0,...,45250.0,44460.0,45250.0,45250.0,44455.0,44460.0,44461.0,44455.0,44456.0,44460.0
unique,45253,,,,,,,,,,...,,,,,,,,,,
top,2017-11-30 00:00:00,,,,,,,,,,...,,,,,,,,,,
freq,1,,,,,,,,,,...,,,,,,,,,,
mean,,283.862654,284.992929,288.155821,284.409626,290.846116,290.215044,292.424887,295.493358,285.617856,...,285.374168,285.400406,280.34301,283.779823,291.521986,294.512307,296.497276,295.266398,294.094803,293.184253
std,,6.640131,7.452438,5.332862,6.547986,6.460823,5.889992,10.829522,9.916743,9.853484,...,10.242377,10.220932,11.953626,9.802499,7.821815,6.676412,8.852984,6.324566,6.304118,7.093583
min,,245.15,262.37,272.3,263.78,266.503667,265.783333,260.561333,266.059,255.042333,...,250.39,250.774,243.3,249.54,272.179,271.049,271.15,271.15,268.682,272.974
25%,,279.16,279.85,284.67,279.83,286.38,286.25475,283.92,287.68,277.97,...,277.350636,277.37,271.97175,276.09,285.366623,289.45,289.734,290.467333,289.881833,287.524279
50%,,283.45,284.32,287.61,283.94,290.53,290.11875,292.027486,295.586667,286.12,...,285.927583,285.87,281.109,284.13325,290.932667,294.9,296.15,294.82,294.15,292.996
75%,,288.600785,289.45175,291.015167,288.53,295.08,294.107542,300.835,303.05,292.835643,...,293.796,293.76,290.369583,291.62,297.27,299.8,303.15,299.66,298.93,299.15


In [3]:
def group_by_days(df):
    df["date"] = pd.to_datetime(df["datetime"]).dt.date
    df = df.drop('datetime', axis=1)
    df = df.groupby(df["date"]).mean()
    return df

def group_by_days_descript(df):
    df["date"] = pd.to_datetime(df["datetime"]).dt.date
    df = df.fillna('no data')
    df = df.drop('datetime', axis=1)
    df = df.groupby(df["date"]).agg(lambda x: pd.Series.mode(x)[0])
    return df

In [4]:
df_humidity = group_by_days(df_humidity)
df_pressure = group_by_days(df_pressure)
df_temper = group_by_days(df_temper)
df_descript = group_by_days_descript(df_descript)
df_direction = group_by_days(df_direction)
df_speed = group_by_days(df_speed)

In [5]:
encoder = LabelEncoder()
df_descript= df_descript.apply(encoder.fit_transform)

In [6]:
def create_dataframe_for_city(city_name):
    df =  pd.concat([
        df_humidity[city_name], df_pressure[city_name], df_temper[city_name], df_descript[city_name], df_direction[city_name], df_speed[city_name]
    ], axis=1)
    df.columns = ['humidity', 'pressure', 'temperature', 'description', 'wind_direction','wind_speed' ]
    return df

In [7]:
df_Portland  =  create_dataframe_for_city("Portland")

In [9]:
df_Portland

Unnamed: 0_level_0,humidity,pressure,temperature,description,wind_direction,wind_speed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-10-01,78.727273,1024.000000,282.118197,11,57.727273,0.000000
2012-10-02,65.833333,1023.583333,286.137728,11,214.041667,1.291667
2012-10-03,66.208333,1021.083333,289.599792,2,228.333333,2.625000
2012-10-04,51.166667,1022.875000,286.482500,12,206.750000,4.625000
2012-10-05,40.391304,1022.916667,288.286042,12,182.250000,3.708333
...,...,...,...,...,...,...
2017-11-26,87.625000,1004.500000,281.588333,8,153.750000,4.958333
2017-11-27,92.875000,1015.625000,279.922500,8,158.083333,1.666667
2017-11-28,88.458333,1019.791667,280.085833,8,144.166667,3.041667
2017-11-29,85.500000,1027.625000,279.860833,8,191.125000,2.166667


# Przewidywanie temperatury

In [8]:
X = df_Portland.iloc[:-2].copy()
y =  df_Portland['temperature'].iloc[4:] 

X.reset_index(inplace= True)
X['day_of_year'] = pd.to_datetime(X['date']).dt.dayofyear
X['month'] = pd.to_datetime(X['date']).dt.month
# X['day_sin'] = np.sin(X['day_of_year']/365 * 2 * np.pi)
# X['day_cos'] = np.cos(X['day_of_year']/365 * 2 * np.pi)
# X = X.drop(columns = ['day_of_year'])

X = X.drop('date', axis=1)
column_names = X.columns.to_list()
X= pd.concat([X.iloc[:-2].reset_index(drop=True),X.iloc[1:-1].reset_index(drop=True),X.iloc[2:].reset_index(drop=True)],axis = 1)
X.columns = column_names + [c + '_1' for c in column_names] + [c + '_2' for c in column_names]
X

Unnamed: 0,humidity,pressure,temperature,description,wind_direction,wind_speed,day_of_year,month,humidity_1,pressure_1,...,day_of_year_1,month_1,humidity_2,pressure_2,temperature_2,description_2,wind_direction_2,wind_speed_2,day_of_year_2,month_2
0,78.727273,1024.000000,282.118197,11,57.727273,0.000000,275,10,65.833333,1023.583333,...,276,10,66.208333,1021.083333,289.599792,2,228.333333,2.625000,277,10
1,65.833333,1023.583333,286.137728,11,214.041667,1.291667,276,10,66.208333,1021.083333,...,277,10,51.166667,1022.875000,286.482500,12,206.750000,4.625000,278,10
2,66.208333,1021.083333,289.599792,2,228.333333,2.625000,277,10,51.166667,1022.875000,...,278,10,40.391304,1022.916667,288.286042,12,182.250000,3.708333,279,10
3,51.166667,1022.875000,286.482500,12,206.750000,4.625000,278,10,40.391304,1022.916667,...,279,10,40.750000,1023.333333,288.291042,12,134.041667,3.666667,280,10
4,40.391304,1022.916667,288.286042,12,182.250000,3.708333,279,10,40.750000,1023.333333,...,280,10,36.681818,1021.000000,288.262500,12,119.166667,4.500000,281,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1878,94.083333,1016.791667,285.447917,8,120.000000,5.583333,326,11,88.250000,1013.833333,...,327,11,83.083333,1019.166667,282.543750,0,199.583333,2.791667,328,11
1879,88.250000,1013.833333,288.386667,7,174.166667,4.666667,327,11,83.083333,1019.166667,...,328,11,87.291667,1017.541667,279.293333,8,140.500000,2.500000,329,11
1880,83.083333,1019.166667,282.543750,0,199.583333,2.791667,328,11,87.291667,1017.541667,...,329,11,87.625000,1004.500000,281.588333,8,153.750000,4.958333,330,11
1881,87.291667,1017.541667,279.293333,8,140.500000,2.500000,329,11,87.625000,1004.500000,...,330,11,92.875000,1015.625000,279.922500,8,158.083333,1.666667,331,11


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
           X, y, test_size = 0.3, random_state = 0,shuffle=False)

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ct = ColumnTransformer(
    [("standarizer", StandardScaler(), ['humidity', 'pressure', 'temperature', 'wind_direction', 'wind_speed',
                                        'humidity_1', 'pressure_1', 'temperature_1', 'wind_direction_1', 'wind_speed_1',
                                        'humidity_2', 'pressure_2', 'temperature_2', 'wind_direction_2', 'wind_speed_2'
                                        ]),
    ("one_hot_encoder", OneHotEncoder(), ['description', 'description_1', 'description_2', 'month', 'month_1', 'month_2'])],
    remainder=StandardScaler())
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [46]:
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train.toarray().astype('float32')), 
    torch.from_numpy(y_train.to_numpy().astype('float32')).unsqueeze(1)
)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=False)
net = P1_Net(
    nn.Sequential(
        nn.Linear(99, 500),
        nn.ReLU(),
        nn.Linear(500,1)
    )
)
print(net)

optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
loss = nn.MSELoss()

do_train(net, data_loader, optimizer, loss, 1000)

P1_Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=99, out_features=500, bias=True)
    (1): ReLU()
    (2): Linear(in_features=500, out_features=1, bias=True)
  )
)
[1,     1] loss: 78393.672
[1,     2] loss: 80700.008
[1,     3] loss: 84055.844
[1,     4] loss: 77418.531
[1,     5] loss: 81745.375
[1,     6] loss: 83926.656
[1,     7] loss: 77158.516
[1,     8] loss: 82787.547
[1,     9] loss: 82652.789
[1,    10] loss: 78244.195
[1,    11] loss: 82363.516
[2,     1] loss: 77751.766
[2,     2] loss: 80204.875
[2,     3] loss: 83466.797
[2,     4] loss: 76830.078
[2,     5] loss: 81236.422
[2,     6] loss: 83334.578
[2,     7] loss: 76502.867
[2,     8] loss: 82175.047
[2,     9] loss: 82043.781
[2,    10] loss: 77642.109
[2,    11] loss: 81776.156
[3,     1] loss: 77088.727
[3,     2] loss: 79638.477
[3,     3] loss: 82769.578
[3,     4] loss: 76143.133
[3,     5] loss: 80597.000
[3,     6] loss: 82617.062
[3,     7] loss: 75744.992
[3,     8] loss: 81396.328
[3, 

In [13]:
def abs_error(A,B):
    err = 0
    for i in range(len(A)):
        err = max(err, abs(A.iloc[i] - B[i]))
    return err

def valid_in_margin(y1, y2, margin):
    return 1 - np.count_nonzero(abs(y1-y2)>margin) / len(y1)


In [47]:
from sklearn.metrics import mean_absolute_error
net.eval()

with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_train.toarray().astype('float32'))).numpy()
    y_pred1 = np.squeeze(y_pred1)
err = abs_error(y_train, y_pred1)
print("Max absolute error")
print(err)

print("Mean absolute error")
print(mean_absolute_error(y_train, y_pred1))
margin = 2
print(f"Fraction of predictions with absolute error <= {margin}")
print(valid_in_margin(y_pred1, y_train, margin))

Max absolute error
8.202107967843745
Mean absolute error
1.7338631084457947
Fraction of predictions with absolute error <= 2
0.6555386949924127


In [48]:
from sklearn.metrics import mean_absolute_error
with torch.no_grad():
    y_pred2 = net(torch.from_numpy(X_test.toarray().astype('float32'))).numpy()
    y_pred2 = np.squeeze(y_pred2)
err2 = abs_error(y_test, y_pred2)
print("Max absolute error")
print(err2)

print("Mean absolute error")
print(mean_absolute_error(y_test, y_pred2))
margin = 2
print(f"Fraction of predictions with absolute error <= {margin}")
print(valid_in_margin(y_pred2, y_test, margin))

Max absolute error
12.493049316406257
Mean absolute error
2.9308896483713966
Fraction of predictions with absolute error <= 2
0.4424778761061947


## Przewidywanie siły wiatru

In [49]:
df_speed_2 = pd.read_csv("wind_speed.csv")
df_speed_2["date"] = pd.to_datetime(df_speed_2["datetime"]).dt.date
df_speed_2 = df_speed_2.drop(columns=['datetime'])
df_speed_2 = df_speed_2.groupby('date').apply(lambda group: (group >= 6).any())

In [50]:
X_strong_wind = df_speed_2['Portland'].apply(lambda x: 1 if x else 0)
X = df_Portland.copy()
X['strong_wind'] = X_strong_wind
X = X.iloc[:-2]
y = X_strong_wind.iloc[4:] 

X.reset_index(inplace= True)
X['day_of_year'] = pd.to_datetime(X['date']).dt.dayofyear
X['month'] = pd.to_datetime(X['date']).dt.month

X = X.drop('date', axis=1)
column_names = X.columns.to_list()
X= pd.concat([X.iloc[:-2].reset_index(drop=True),X.iloc[1:-1].reset_index(drop=True),X.iloc[2:].reset_index(drop=True)],axis = 1)
X.columns = column_names + [c + '_1' for c in column_names] + [c + '_2' for c in column_names]
X

Unnamed: 0,humidity,pressure,temperature,description,wind_direction,wind_speed,strong_wind,day_of_year,month,humidity_1,...,month_1,humidity_2,pressure_2,temperature_2,description_2,wind_direction_2,wind_speed_2,strong_wind_2,day_of_year_2,month_2
0,78.727273,1024.000000,282.118197,11,57.727273,0.000000,0,275,10,65.833333,...,10,66.208333,1021.083333,289.599792,2,228.333333,2.625000,0,277,10
1,65.833333,1023.583333,286.137728,11,214.041667,1.291667,1,276,10,66.208333,...,10,51.166667,1022.875000,286.482500,12,206.750000,4.625000,1,278,10
2,66.208333,1021.083333,289.599792,2,228.333333,2.625000,0,277,10,51.166667,...,10,40.391304,1022.916667,288.286042,12,182.250000,3.708333,1,279,10
3,51.166667,1022.875000,286.482500,12,206.750000,4.625000,1,278,10,40.391304,...,10,40.750000,1023.333333,288.291042,12,134.041667,3.666667,1,280,10
4,40.391304,1022.916667,288.286042,12,182.250000,3.708333,1,279,10,40.750000,...,10,36.681818,1021.000000,288.262500,12,119.166667,4.500000,1,281,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1878,94.083333,1016.791667,285.447917,8,120.000000,5.583333,1,326,11,88.250000,...,11,83.083333,1019.166667,282.543750,0,199.583333,2.791667,1,328,11
1879,88.250000,1013.833333,288.386667,7,174.166667,4.666667,1,327,11,83.083333,...,11,87.291667,1017.541667,279.293333,8,140.500000,2.500000,0,329,11
1880,83.083333,1019.166667,282.543750,0,199.583333,2.791667,1,328,11,87.291667,...,11,87.625000,1004.500000,281.588333,8,153.750000,4.958333,1,330,11
1881,87.291667,1017.541667,279.293333,8,140.500000,2.500000,0,329,11,87.625000,...,11,92.875000,1015.625000,279.922500,8,158.083333,1.666667,0,331,11


In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ct_wind = ColumnTransformer(
    [("standarizer", StandardScaler(), ['humidity', 'pressure', 'temperature', 'wind_direction', 'wind_speed',
                                        'humidity_1', 'pressure_1', 'temperature_1', 'wind_direction_1', 'wind_speed_1',
                                        'humidity_2', 'pressure_2', 'temperature_2', 'wind_direction_2', 'wind_speed_2'
                                        ]),
    ("one_hot_encoder", OneHotEncoder(), ['description', 'description_1', 'description_2', 'month', 'month_1', 'month_2']),
    ('pass', 'passthrough', ['strong_wind', 'strong_wind_1', 'strong_wind_2'])],
    remainder=StandardScaler())

X_train, X_test, y_train, y_test = train_test_split(
           X, y, test_size = 0.3, random_state = 0,shuffle=False)
X_train = ct_wind.fit_transform(X_train)
X_test = ct_wind.transform(X_test)
    
net = MLPClassifier(random_state=0, max_iter=1000, activation ='relu', learning_rate = 'adaptive', learning_rate_init = 0.001, hidden_layer_sizes=1000)

In [62]:
y_0 = np.array([1 if i==0 else 0 for i in y_train])
y_t = np.concat([np.expand_dims(y_0, axis=1),np.expand_dims(y_train, axis=1)],axis=1)
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train.toarray().astype('float32')), 
    torch.from_numpy(y_t.astype('float32'))
)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=False)
net = P1_Net(
    nn.Sequential(
        nn.Linear(102, 500),
        nn.ReLU(),
        nn.Linear(500,2),
        nn.Sigmoid()
    )
)
print(net)

optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()

do_train(net, data_loader, optimizer, loss, 1000)

(565, 102)
P1_Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=102, out_features=500, bias=True)
    (1): ReLU()
    (2): Linear(in_features=500, out_features=2, bias=True)
    (3): Sigmoid()
  )
)
[1,     1] loss: 0.694
[1,     2] loss: 0.697
[1,     3] loss: 0.700
[1,     4] loss: 0.688
[1,     5] loss: 0.680
[1,     6] loss: 0.642
[1,     7] loss: 0.612
[1,     8] loss: 0.630
[1,     9] loss: 0.668
[1,    10] loss: 0.707
[1,    11] loss: 0.677
[2,     1] loss: 0.684
[2,     2] loss: 0.655
[2,     3] loss: 0.564
[2,     4] loss: 0.633
[2,     5] loss: 0.639
[2,     6] loss: 0.521
[2,     7] loss: 0.499
[2,     8] loss: 0.540
[2,     9] loss: 0.642
[2,    10] loss: 0.721
[2,    11] loss: 0.670
[3,     1] loss: 0.686
[3,     2] loss: 0.636
[3,     3] loss: 0.484
[3,     4] loss: 0.599
[3,     5] loss: 0.612
[3,     6] loss: 0.447
[3,     7] loss: 0.429
[3,     8] loss: 0.485
[3,     9] loss: 0.629
[3,    10] loss: 0.726
[3,    11] loss: 0.667
[4,     1] loss: 0.685
[

In [74]:
from sklearn.metrics import roc_auc_score
net.eval()
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_train.toarray().astype('float32')))[:,1]
print(roc_auc_score(y_train, y_pred1))

0.9083858824622909


In [76]:
from sklearn.metrics import roc_auc_score
net.eval()
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_test.toarray().astype('float32')))[:,1]
print(roc_auc_score(y_test, y_pred1))

0.621027397260274
