In [1]:
import numpy as np
import pandas as pd
import datetime    
import torch
import torch.nn as nn
import torch.utils.data.dataloader
from neural_net import P1_Net, do_train

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.utils.validation import column_or_1d

In [2]:
df_city = pd.read_csv("city_attributes.csv")
df_humidity = pd.read_csv("humidity.csv")
df_pressure = pd.read_csv("pressure.csv")
df_temper = pd.read_csv("temperature.csv")
df_descript = pd.read_csv("weather_description.csv")
df_direction = pd.read_csv("wind_direction.csv")
df_speed = pd.read_csv("wind_speed.csv")

In [3]:
max([df_humidity.isna().sum().mean(),
df_pressure.isna().sum().mean(),
df_temper.isna().sum().mean(),
df_direction.isna().sum().mean(),
df_speed.isna().sum().mean()])

774.3513513513514

In [4]:
df_temper.describe(include='all')

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
count,45253,44458.0,45252.0,44460.0,45250.0,45250.0,45252.0,45252.0,45250.0,45252.0,...,45250.0,44460.0,45250.0,45250.0,44455.0,44460.0,44461.0,44455.0,44456.0,44460.0
unique,45253,,,,,,,,,,...,,,,,,,,,,
top,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,,,,
freq,1,,,,,,,,,,...,,,,,,,,,,
mean,,283.862654,284.992929,288.155821,284.409626,290.846116,290.215044,292.424887,295.493358,285.617856,...,285.374168,285.400406,280.34301,283.779823,291.521986,294.512307,296.497276,295.266398,294.094803,293.184253
std,,6.640131,7.452438,5.332862,6.547986,6.460823,5.889992,10.829522,9.916743,9.853484,...,10.242377,10.220932,11.953626,9.802499,7.821815,6.676412,8.852984,6.324566,6.304118,7.093583
min,,245.15,262.37,272.3,263.78,266.503667,265.783333,260.561333,266.059,255.042333,...,250.39,250.774,243.3,249.54,272.179,271.049,271.15,271.15,268.682,272.974
25%,,279.16,279.85,284.67,279.83,286.38,286.25475,283.92,287.68,277.97,...,277.350636,277.37,271.97175,276.09,285.366623,289.45,289.734,290.467333,289.881833,287.524279
50%,,283.45,284.32,287.61,283.94,290.53,290.11875,292.027486,295.586667,286.12,...,285.927583,285.87,281.109,284.13325,290.932667,294.9,296.15,294.82,294.15,292.996
75%,,288.600785,289.45175,291.015167,288.53,295.08,294.107542,300.835,303.05,292.835643,...,293.796,293.76,290.369583,291.62,297.27,299.8,303.15,299.66,298.93,299.15


In [5]:
def group_by_days(df):
    df["date"] = pd.to_datetime(df["datetime"]).dt.date
    df = df.drop('datetime', axis=1)
    df = df.groupby(df["date"]).mean()
    return df

def group_by_days_descript(df):
    df["date"] = pd.to_datetime(df["datetime"]).dt.date
    df = df.fillna('no data')
    df = df.drop('datetime', axis=1)
    df = df.groupby(df["date"]).agg(lambda x: pd.Series.mode(x)[0])
    return df

In [6]:
df_humidity = group_by_days(df_humidity)
df_pressure = group_by_days(df_pressure)
df_temper = group_by_days(df_temper)
df_descript = group_by_days_descript(df_descript)
df_direction = group_by_days(df_direction)
df_speed = group_by_days(df_speed)

In [7]:
encoder = LabelEncoder()
df_descript= df_descript.apply(encoder.fit_transform)

In [8]:
def create_dataframe_for_city(city_name):
    df =  pd.concat([
        df_humidity[city_name], df_pressure[city_name], df_temper[city_name], df_descript[city_name], df_direction[city_name], df_speed[city_name]
    ], axis=1)
    df.columns = ['humidity', 'pressure', 'temperature', 'description', 'wind_direction','wind_speed' ]
    return df

In [9]:
df_Portland  =  create_dataframe_for_city("Portland")
df_Portland

Unnamed: 0_level_0,humidity,pressure,temperature,description,wind_direction,wind_speed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-10-01,78.727273,1024.000000,282.118197,11,57.727273,0.000000
2012-10-02,65.833333,1023.583333,286.137728,11,214.041667,1.291667
2012-10-03,66.208333,1021.083333,289.599792,2,228.333333,2.625000
2012-10-04,51.166667,1022.875000,286.482500,12,206.750000,4.625000
2012-10-05,40.391304,1022.916667,288.286042,12,182.250000,3.708333
...,...,...,...,...,...,...
2017-11-26,87.625000,1004.500000,281.588333,8,153.750000,4.958333
2017-11-27,92.875000,1015.625000,279.922500,8,158.083333,1.666667
2017-11-28,88.458333,1019.791667,280.085833,8,144.166667,3.041667
2017-11-29,85.500000,1027.625000,279.860833,8,191.125000,2.166667


In [10]:
df_speed_2 = pd.read_csv("wind_speed.csv")
df_speed_2["date"] = pd.to_datetime(df_speed_2["datetime"]).dt.date
df_speed_2 = df_speed_2.drop(columns=['datetime'])
df_speed_2 = df_speed_2.groupby('date').apply(lambda group: (group >= 6).any())
strong_wind = pd.concat([
    df_speed_2[city] for city in df_city['City']
])

def add_city_col(df, city):
    df['city'] = city
    return df
all_cities_df = pd.concat([
    add_city_col(create_dataframe_for_city(city), city) for city in df_city['City']
])
all_cities_df['strong_wind'] = strong_wind
all_cities_df['strong_wind'] = all_cities_df['strong_wind'].apply(lambda x: 1 if x else 0)
all_cities_df = all_cities_df.dropna().sort_values('date', kind='stable')

In [11]:
all_cities_df

Unnamed: 0_level_0,humidity,pressure,temperature,description,wind_direction,wind_speed,city,strong_wind
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-10-01,78.727273,1024.000000,282.118197,11,57.727273,0.000000,Portland,0
2012-10-01,83.000000,1009.727273,289.416642,14,122.363636,1.636364,San Francisco,0
2012-10-01,78.000000,1030.000000,281.767262,1,32.272727,0.000000,Seattle,0
2012-10-01,88.000000,1013.000000,291.846501,11,0.000000,0.000000,Los Angeles,0
2012-10-01,79.909091,1013.000000,291.573495,11,0.000000,0.000000,San Diego,0
...,...,...,...,...,...,...,...,...
2017-11-30,42.000000,1025.000000,279.190000,12,360.000000,2.000000,Pittsburgh,0
2017-11-30,60.000000,1027.000000,274.510000,17,330.000000,3.000000,Toronto,0
2017-11-30,32.000000,1024.000000,283.420000,11,360.000000,4.000000,Philadelphia,0
2017-11-30,58.000000,1027.000000,271.800000,1,300.000000,4.000000,Montreal,0


# Przewidywanie temperatury

In [12]:
def extract_x_y_temperature(df):
    X = df.iloc[:-2].copy()
    y =  df['temperature'].iloc[4:] 

    X.reset_index(inplace= True)
    X = X.drop(columns = ['strong_wind'])
    X['day_of_year'] = pd.to_datetime(X['date']).dt.dayofyear
    X['month'] = pd.to_datetime(X['date']).dt.month

    # X['wind_dir_sin'] = np.sin(X['wind_direction']/360 * 2 * np.pi)
    # X['wind_dir_cos'] = np.cos(X['wind_direction']/360 * 2 * np.pi)
    # X['day_sin'] = np.sin(X['day_of_year']/365 * 2 * np.pi)
    # X['day_cos'] = np.cos(X['day_of_year']/365 * 2 * np.pi)
    # X = X.drop(columns = ['day_of_year'])
    # X = X.drop(columns = ['wind_direction'])

    X = X.drop('date', axis=1)
    column_names = X.columns.to_list()
    X = pd.concat([X.iloc[:-2].reset_index(drop=True),X.iloc[1:-1].reset_index(drop=True),X.iloc[2:].reset_index(drop=True)],axis = 1)
    X.columns = column_names + [c + '_1' for c in column_names] + [c + '_2' for c in column_names]
    if 'city_1' in X.columns:
        X = X.drop(columns=['city_1', 'city_2'])
    return X, y

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def gen_col_set(col_names):
    return [ final_col
        for col_name in col_names
        for final_col in  [col_name, col_name+"_1", col_name+"_2"] ]

def get_col_transformer_temp(df):
    return ColumnTransformer(
        [
            (
                "standarizer",
                StandardScaler(),
                gen_col_set(
                    ["humidity", "pressure", "temperature", "wind_speed", "wind_direction", "day_of_year"]
                ),
            ),
            (
                "one_hot_encoder", 
                OneHotEncoder(), 
                gen_col_set(
                    ["description", "month"]
                ) + (['city'] if 'city' in df.columns else [])
            ),
        ],
        remainder="passthrough",
    )

def prepare_temperature_data(df):
    X, y = extract_x_y_temperature(df)
    X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size = 0.3, random_state = 0,shuffle=False)
    ct = get_col_transformer_temp(X_train)
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)
    return X_train, X_test, y_train, y_test

In [14]:
X_train, X_test, y_train, y_test = prepare_temperature_data(all_cities_df)
X_train.shape

(47273, 150)

In [20]:

dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train.toarray().astype('float32')), 
    torch.from_numpy(y_train.to_numpy().astype('float32')).unsqueeze(1)
)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=False)
net = P1_Net(
    nn.Sequential(
        nn.Linear(X_train.shape[1], 256),
        nn.ReLU(),
        nn.Linear(256,256),
        nn.ReLU(),
        nn.Linear(256,256),
        nn.ReLU(),
        nn.Linear(256,1)
    )
)
print(net)

optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
loss = nn.MSELoss()

do_train(net, data_loader, optimizer, loss, 400)

P1_Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=150, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=256, bias=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=1, bias=True)
  )
)
[1] loss: 82165.267
[2] loss: 69242.410
[3] loss: 19230.297
[4] loss: 674.919
[5] loss: 485.702
[6] loss: 387.272
[7] loss: 329.477
[8] loss: 289.511
[9] loss: 258.787
[10] loss: 233.679
[11] loss: 212.394
[12] loss: 193.993
[13] loss: 177.842
[14] loss: 163.546
[15] loss: 150.837
[16] loss: 139.490
[17] loss: 129.329
[18] loss: 120.181
[19] loss: 111.925
[20] loss: 104.474
[21] loss: 97.702
[22] loss: 91.537
[23] loss: 85.889
[24] loss: 80.708
[25] loss: 75.936
[26] loss: 71.538
[27] loss: 67.481
[28] loss: 63.729
[29] loss: 60.256
[30] loss: 57.042
[31] loss: 54.096
[32] loss: 51.392
[33] loss: 48.932
[34] loss: 46.700
[35] loss: 44.683
[36] loss:

In [21]:
from sklearn.metrics import mean_absolute_error
net.eval()

def valid_in_margin(y1, y2, margin):
    return 1 - np.count_nonzero(abs(y1-y2)>margin) / len(y1)
print("* train")
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_train.toarray().astype('float32'))).numpy()
    y_pred1 = np.squeeze(y_pred1)

mae = mean_absolute_error(y_train, y_pred1)
print(f"\t* Mean absolute error: {mae}")
margin = 2
fraction = valid_in_margin(y_pred1, y_train, margin)
print(f"\t* Fraction of predictions with absolute error <= {margin}: {fraction}")

print("* test")
with torch.no_grad():
    y_pred2 = net(torch.from_numpy(X_test.toarray().astype('float32'))).numpy()
    y_pred2 = np.squeeze(y_pred2)

mae = mean_absolute_error(y_test, y_pred2)
print(f"\t* Mean absolute error: {mae}")
margin = 2
fraction = valid_in_margin(y_pred2, y_test, margin)
print(f"\t* Fraction of predictions with absolute error <= {margin}: {fraction}")

* train
	* Mean absolute error: 3.885937565076981
	* Fraction of predictions with absolute error <= 2: 0.2896579442810907
* test
	* Mean absolute error: 4.373468597912471
	* Fraction of predictions with absolute error <= 2: 0.23947485316618133


In [28]:
X_train, X_test, y_train, y_test = prepare_temperature_data(all_cities_df)
X_train.shape
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train.toarray().astype('float32')), 
    torch.from_numpy(y_train.to_numpy().astype('float32')).unsqueeze(1)
)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=False)
net = P1_Net(
    nn.Sequential(
        nn.Linear(X_train.shape[1], 256),
        nn.ReLU(),
         nn.Dropout(p=0.5),
        nn.Linear(256,1)
    )
)
print(net)

optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
loss = nn.MSELoss()

do_train(net, data_loader, optimizer, loss, 400)

P1_Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=150, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=256, out_features=1, bias=True)
  )
)
[1] loss: 82691.454
[2] loss: 82415.487
[3] loss: 82050.061
[4] loss: 81539.956
[5] loss: 80849.777
[6] loss: 79950.222
[7] loss: 78829.962
[8] loss: 77480.513
[9] loss: 75903.258
[10] loss: 74101.628
[11] loss: 72069.128
[12] loss: 69837.100
[13] loss: 67397.638
[14] loss: 64811.151
[15] loss: 62077.972
[16] loss: 59240.311
[17] loss: 56271.662
[18] loss: 53243.073
[19] loss: 50182.315
[20] loss: 47083.287
[21] loss: 43969.490
[22] loss: 40903.803
[23] loss: 37851.140
[24] loss: 34849.744
[25] loss: 31926.627
[26] loss: 29061.138
[27] loss: 26352.617
[28] loss: 23756.903
[29] loss: 21280.292
[30] loss: 18967.704
[31] loss: 16780.856
[32] loss: 14754.517
[33] loss: 12905.610
[34] loss: 11189.308
[35] loss: 9613.042
[36] loss: 8230.088
[37] loss: 6973.417
[38] loss

In [29]:
from sklearn.metrics import mean_absolute_error
net.eval()

def valid_in_margin(y1, y2, margin):
    return 1 - np.count_nonzero(abs(y1-y2)>margin) / len(y1)
print("* train")
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_train.toarray().astype('float32'))).numpy()
    y_pred1 = np.squeeze(y_pred1)

mae = mean_absolute_error(y_train, y_pred1)
print(f"\t* Mean absolute error: {mae}")
margin = 2
fraction = valid_in_margin(y_pred1, y_train, margin)
print(f"\t* Fraction of predictions with absolute error <= {margin}: {fraction}")

print("* test")
with torch.no_grad():
    y_pred2 = net(torch.from_numpy(X_test.toarray().astype('float32'))).numpy()
    y_pred2 = np.squeeze(y_pred2)

mae = mean_absolute_error(y_test, y_pred2)
print(f"\t* Mean absolute error: {mae}")
margin = 2
fraction = valid_in_margin(y_pred2, y_test, margin)
print(f"\t* Fraction of predictions with absolute error <= {margin}: {fraction}")

* train
	* Mean absolute error: 2.8950658878883457
	* Fraction of predictions with absolute error <= 2: 0.44014553762189834
* test
	* Mean absolute error: 2.8562665584401485
	* Fraction of predictions with absolute error <= 2: 0.44760870638171857


In [32]:
X_train, X_test, y_train, y_test = prepare_temperature_data(all_cities_df)
X_train.shape
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train.toarray().astype('float32')), 
    torch.from_numpy(y_train.to_numpy().astype('float32')).unsqueeze(1)
)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=False)
net = P1_Net(
    nn.Sequential(
        nn.Linear(X_train.shape[1], 256),
        nn.ReLU(),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Linear(64,1)
    )
)
print(net)

optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
loss = nn.MSELoss()

do_train(net, data_loader, optimizer, loss, 400)

P1_Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=150, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
)
[1] loss: 82645.218
[2] loss: 81294.396
[3] loss: 72864.593
[4] loss: 47824.027
[5] loss: 14466.623
[6] loss: 1031.784
[7] loss: 233.416
[8] loss: 183.751
[9] loss: 150.614
[10] loss: 129.115
[11] loss: 114.331
[12] loss: 103.595
[13] loss: 95.556
[14] loss: 88.878
[15] loss: 83.192
[16] loss: 78.423
[17] loss: 74.215
[18] loss: 70.438
[19] loss: 67.001
[20] loss: 63.846
[21] loss: 60.928
[22] loss: 58.212
[23] loss: 55.674
[24] loss: 53.294
[25] loss: 51.062
[26] loss: 48.960
[27] loss: 46.987
[28] loss: 45.134
[29] loss: 43.396
[30] loss: 41.766
[31] loss: 40.240
[32] loss: 38.809
[33] loss: 37.476
[34] loss: 36.216
[35] loss: 35.005
[36] loss: 33.8

In [33]:
from sklearn.metrics import mean_absolute_error
net.eval()

def valid_in_margin(y1, y2, margin):
    return 1 - np.count_nonzero(abs(y1-y2)>margin) / len(y1)
print("* train")
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_train.toarray().astype('float32'))).numpy()
    y_pred1 = np.squeeze(y_pred1)

mae = mean_absolute_error(y_train, y_pred1)
print(f"\t* Mean absolute error: {mae}")
margin = 2
fraction = valid_in_margin(y_pred1, y_train, margin)
print(f"\t* Fraction of predictions with absolute error <= {margin}: {fraction}")

print("* test")
with torch.no_grad():
    y_pred2 = net(torch.from_numpy(X_test.toarray().astype('float32'))).numpy()
    y_pred2 = np.squeeze(y_pred2)

mae = mean_absolute_error(y_test, y_pred2)
print(f"\t* Mean absolute error: {mae}")
margin = 2
fraction = valid_in_margin(y_pred2, y_test, margin)
print(f"\t* Fraction of predictions with absolute error <= {margin}: {fraction}")

* train
	* Mean absolute error: 3.420493176038504
	* Fraction of predictions with absolute error <= 2: 0.36335751909123604
* test
	* Mean absolute error: 3.231835450897093
	* Fraction of predictions with absolute error <= 2: 0.40027639307043084


## Przewidywanie siły wiatru

In [22]:
def extract_x_y_wind(df):
    X = df.copy()
    X = X.iloc[:-2]
    y = df['strong_wind'].iloc[4:] 

    X.reset_index(inplace= True)
    X['day_of_year'] = pd.to_datetime(X['date']).dt.dayofyear
    X['month'] = pd.to_datetime(X['date']).dt.month
    # X['day_sin'] = np.sin(X['day_of_year']/365 * 2 * np.pi)
    # X['day_cos'] = np.cos(X['day_of_year']/365 * 2 * np.pi)
    # X = X.drop(columns = ['day_of_year'])


    X = X.drop('date', axis=1)
    column_names = X.columns.to_list()
    X= pd.concat([X.iloc[:-2].reset_index(drop=True),X.iloc[1:-1].reset_index(drop=True),X.iloc[2:].reset_index(drop=True)],axis = 1)
    X.columns = column_names + [c + '_1' for c in column_names] + [c + '_2' for c in column_names]
    if 'city_1' in X.columns:
        X = X.drop(columns=['city_1', 'city_2'])
    return X, y

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def get_col_transformer_wind(df):
    return ColumnTransformer(
        [
            (
                "standarizer",
                StandardScaler(),
                gen_col_set(
                    ["humidity", "pressure", "temperature", "wind_speed", "wind_direction", "day_of_year"]
                ),
            ),
            (
                "one_hot_encoder", 
                OneHotEncoder(), 
                gen_col_set(
                    ["description", "month"]
                ) + (['city'] if 'city' in df.columns else [])
            ),
            ('pass', 'passthrough', gen_col_set(['strong_wind'])),
        ],
        remainder="passthrough",
    )

def prepare_wind_data(df):
    X, y = extract_x_y_wind(df)
    X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size = 0.3, random_state = 0,shuffle=False)
    ct = get_col_transformer_wind(X_train)
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)
    return X_train, X_test, y_train, y_test

In [24]:
X_train, X_test, y_train, y_test = prepare_wind_data(all_cities_df)
X_train.shape

(47273, 153)

In [26]:
y_0 = np.array([1 if i==0 else 0 for i in y_train])
y_t = np.concatenate([np.expand_dims(y_0, axis=1),np.expand_dims(y_train, axis=1)],axis=1)
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train.toarray().astype('float32')), 
    torch.from_numpy(y_t.astype('float32'))
)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=False)
net = P1_Net(
    nn.Sequential(
        nn.Linear(X_train.shape[1], 256),
        nn.ReLU(),
        nn.Linear(256,256),
        nn.ReLU(),
        nn.Linear(256,256),
        nn.ReLU(),
        nn.Linear(256,2),
        nn.Softmax(dim=1)
    )
)
print(net)

optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()

do_train(net, data_loader, optimizer, loss, 200)

P1_Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=153, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=256, bias=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=2, bias=True)
    (7): Softmax(dim=1)
  )
)
[1] loss: 0.652
[2] loss: 0.609
[3] loss: 0.590
[4] loss: 0.583
[5] loss: 0.575
[6] loss: 0.568
[7] loss: 0.562
[8] loss: 0.556
[9] loss: 0.546
[10] loss: 0.545
[11] loss: 0.537
[12] loss: 0.548
[13] loss: 0.532
[14] loss: 0.540
[15] loss: 0.528
[16] loss: 0.522
[17] loss: 0.517
[18] loss: 0.521
[19] loss: 0.514
[20] loss: 0.512
[21] loss: 0.516
[22] loss: 0.519
[23] loss: 0.510
[24] loss: 0.497
[25] loss: 0.500
[26] loss: 0.498
[27] loss: 0.495
[28] loss: 0.487
[29] loss: 0.479
[30] loss: 0.483
[31] loss: 0.490
[32] loss: 0.469
[33] loss: 0.461
[34] loss: 0.477
[35] loss: 0.466
[36] loss: 0.454
[37] loss: 0.452
[38] loss: 0.

In [27]:
from sklearn.metrics import roc_auc_score
net.eval()
print("* train")
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_train.toarray().astype('float32')))[:,1]
print(f"\t* ROC_AUC: {roc_auc_score(y_train, y_pred1)}")

print("* test")
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_test.toarray().astype('float32')))[:,1]
print(f"\t* ROC_AUC: {roc_auc_score(y_test, y_pred1)}")

* train
	* ROC_AUC: 0.9235322304553445
* test
	* ROC_AUC: 0.7359713849721106


In [30]:
X_train, X_test, y_train, y_test = prepare_wind_data(all_cities_df)
X_train.shape
y_0 = np.array([1 if i==0 else 0 for i in y_train])
y_t = np.concatenate([np.expand_dims(y_0, axis=1),np.expand_dims(y_train, axis=1)],axis=1)
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train.toarray().astype('float32')), 
    torch.from_numpy(y_t.astype('float32'))
)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=False)
net = P1_Net(
    nn.Sequential(
        nn.Linear(X_train.shape[1], 256),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        nn.Linear(256,2),
        nn.Softmax(dim=1)
    )
)
print(net)

optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()

do_train(net, data_loader, optimizer, loss, 200)

P1_Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=153, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=256, out_features=2, bias=True)
    (4): Softmax(dim=1)
  )
)
[1] loss: 0.643
[2] loss: 0.608
[3] loss: 0.595
[4] loss: 0.587
[5] loss: 0.581
[6] loss: 0.576
[7] loss: 0.572
[8] loss: 0.568
[9] loss: 0.565
[10] loss: 0.562
[11] loss: 0.559
[12] loss: 0.557
[13] loss: 0.554
[14] loss: 0.551
[15] loss: 0.550
[16] loss: 0.548
[17] loss: 0.545
[18] loss: 0.545
[19] loss: 0.543
[20] loss: 0.542
[21] loss: 0.541
[22] loss: 0.539
[23] loss: 0.537
[24] loss: 0.536
[25] loss: 0.535
[26] loss: 0.534
[27] loss: 0.533
[28] loss: 0.531
[29] loss: 0.531
[30] loss: 0.529
[31] loss: 0.529
[32] loss: 0.527
[33] loss: 0.526
[34] loss: 0.526
[35] loss: 0.525
[36] loss: 0.524
[37] loss: 0.523
[38] loss: 0.522
[39] loss: 0.521
[40] loss: 0.521
[41] loss: 0.520
[42] loss: 0.521
[43] loss: 0.519
[44] loss: 0.518
[45] loss: 

In [31]:
from sklearn.metrics import roc_auc_score
net.eval()
print("* train")
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_train.toarray().astype('float32')))[:,1]
print(f"\t* ROC_AUC: {roc_auc_score(y_train, y_pred1)}")

print("* test")
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_test.toarray().astype('float32')))[:,1]
print(f"\t* ROC_AUC: {roc_auc_score(y_test, y_pred1)}")

* train
	* ROC_AUC: 0.8920216206609455
* test
	* ROC_AUC: 0.7655038960096339


In [34]:
X_train, X_test, y_train, y_test = prepare_wind_data(all_cities_df)
X_train.shape
y_0 = np.array([1 if i==0 else 0 for i in y_train])
y_t = np.concatenate([np.expand_dims(y_0, axis=1),np.expand_dims(y_train, axis=1)],axis=1)
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train.toarray().astype('float32')), 
    torch.from_numpy(y_t.astype('float32'))
)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=False)
net = P1_Net(
    nn.Sequential(
        nn.Linear(X_train.shape[1], 256),
        nn.ReLU(),
        nn.Linear(256,128),
        nn.ReLU(),
        nn.Linear(128,64),
        nn.ReLU(),
        nn.Linear(64,2),
        nn.Softmax(dim=1)
    )
)
print(net)

optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()

do_train(net, data_loader, optimizer, loss, 200)

P1_Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=153, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=2, bias=True)
    (7): Softmax(dim=1)
  )
)
[1] loss: 0.654
[2] loss: 0.610
[3] loss: 0.594
[4] loss: 0.585
[5] loss: 0.577
[6] loss: 0.570
[7] loss: 0.564
[8] loss: 0.558
[9] loss: 0.557
[10] loss: 0.547
[11] loss: 0.552
[12] loss: 0.541
[13] loss: 0.549
[14] loss: 0.537
[15] loss: 0.537
[16] loss: 0.527
[17] loss: 0.529
[18] loss: 0.528
[19] loss: 0.532
[20] loss: 0.523
[21] loss: 0.512
[22] loss: 0.515
[23] loss: 0.520
[24] loss: 0.514
[25] loss: 0.508
[26] loss: 0.507
[27] loss: 0.505
[28] loss: 0.501
[29] loss: 0.485
[30] loss: 0.500
[31] loss: 0.493
[32] loss: 0.497
[33] loss: 0.481
[34] loss: 0.475
[35] loss: 0.478
[36] loss: 0.491
[37] loss: 0.480
[38] loss: 0.46

In [35]:
from sklearn.metrics import roc_auc_score
net.eval()
print("* train")
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_train.toarray().astype('float32')))[:,1]
print(f"\t* ROC_AUC: {roc_auc_score(y_train, y_pred1)}")

print("* test")
with torch.no_grad():
    y_pred1 = net(torch.from_numpy(X_test.toarray().astype('float32')))[:,1]
print(f"\t* ROC_AUC: {roc_auc_score(y_test, y_pred1)}")

* train
	* ROC_AUC: 0.9165502355096555
* test
	* ROC_AUC: 0.7293944054175762
