In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from pandas.plotting import register_matplotlib_converters

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Reproducibility: seed both NumPy and TensorFlow with the same constant.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Plot setup: pandas date converters, the seaborn theme, then a wide default
# figure. The figure size is set after sns.set() so the theme call cannot
# override it.
register_matplotlib_converters()
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = (22, 10)

In [None]:
# Colab-only step: mount Google Drive so the CSV paths under /content/drive
# used by the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the raw training table from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/test/train.csv")

In [None]:
# (rows, columns) of the raw table, before any filtering.
df.shape

(527040, 17)

In [None]:
# Peek at the first rows to see the raw column names and their units.
df.head()

Unnamed: 0,DATE (MM/DD),MST,Global CMP22 (vent/cor) [W/m^2],Direct sNIP [W/m^2],Azimuth Angle [degrees],Tower Dry Bulb Temp [deg C],Tower Wet Bulb Temp [deg C],Tower Dew Point Temp [deg C],Tower RH [%],Total Cloud Cover [%],Peak Wind Speed @ 6ft [m/s],Avg Wind Direction @ 6ft [deg from N],Station Pressure [mBar],Precipitation (Accumulated) [mm],Snow Depth [cm],Moisture,Albedo (CMP11)
0,1/1,00:00,-0.962276,0.0,356.8564,7.216,0.988,-7.312,32.33,-1,9.95,271.3,806.779,0.0,0.219,0.0,0.0
1,1/1,00:01,-0.937921,0.0,357.65505,7.251,1.04,-7.26,32.4,-1,8.2,272.9,806.84,0.0,0.206,0.0,0.0
2,1/1,00:02,-0.944395,0.0,358.45438,7.256,1.093,-7.207,32.54,-1,6.7,288.8,806.876,0.0,0.148,0.0,0.0
3,1/1,00:03,-0.95135,-0.029673,359.25416,7.254,1.06,-7.44,31.89,-1,7.7,294.0,806.823,0.0,0.235,0.0,0.0
4,1/1,00:04,-0.934976,-0.054401,0.05415,7.331,1.081,-7.419,31.78,-1,7.2,285.5,806.762,0.0,0.182,0.0,0.0


In [None]:
# Keep only rows with a non-negative cloud-cover value. The first rows of the
# raw table show -1 in this column -- presumably a "no reading" sentinel;
# confirm against the data documentation.
# NOTE(review): dropping rows leaves gaps in the time axis, and later cells
# build windows by row position, so a window may span such a gap.
has_cloud_reading = df['Total Cloud Cover [%]'] >= 0
df = df[has_cloud_reading]

In [None]:
#df.corr()

In [None]:
# Replace the verbose station column names with short aliases used by the rest
# of the notebook. The result is rebound instead of using inplace=True, so the
# frame produced by the filtering cell is never mutated behind the reader's back.
# NOTE: 'temp' is the *dew point* temperature column, not the dry-bulb one.
df = df.rename(columns={
    'Direct sNIP [W/m^2]': 'snip',
    'Total Cloud Cover [%]': 'cloud cover',
    'Peak Wind Speed @ 6ft [m/s]': 'wind speed',
    'Avg Wind Direction @ 6ft [deg from N]': 'wind dir',
    'Tower RH [%]': 'rh',
    'Global CMP22 (vent/cor) [W/m^2]': 'cmp22',
    'Tower Dew Point Temp [deg C]': 'temp',
    'Moisture': 'moisture',
})

In [None]:
# Columns kept for modelling: seven weather features plus the target
# ('cloud cover'), which is listed last.
req_cols = [
    'snip', 'wind speed', 'wind dir', 'rh',
    'cmp22', 'temp', 'moisture',
    'cloud cover',
]

df = df[req_cols]

df.head()

Unnamed: 0,snip,wind speed,wind dir,rh,cmp22,temp,moisture,cloud cover
467,0.079127,0.7,210.3,38.24,14.4935,-5.597,0.0,6
468,-0.034618,0.7,232.2,38.58,16.5178,-5.606,0.0,8
469,-0.153309,1.7,263.9,39.12,18.6171,-5.54,0.0,10
470,-0.143419,2.2,277.5,39.35,20.5986,-5.494,0.0,13
471,-0.079128,2.2,233.4,39.64,22.4362,-5.384,0.0,15


In [None]:
# (rows, columns) after dropping negative cloud-cover rows and unused columns.
df.shape

(250731, 8)

In [None]:
# Chronological 90/10 split: the first 90% of rows train the model and the
# remaining tail is held out. No shuffling, so the row order is preserved.
train_size = int(len(df) * 0.9)
test_size = len(df) - train_size

# .copy() gives each split its own data. Without it `train`/`test` are slices
# of `df`, and the in-place scaling done later triggers pandas'
# SettingWithCopyWarning (and may or may not write through to `df`).
train = df.iloc[:train_size].copy()
test = df.iloc[train_size:].copy()
print(len(train), len(test))

225657 25074


# Preprocessing

In [None]:
# Show the selected columns once more before scaling.
df.head()

Unnamed: 0,snip,wind speed,wind dir,rh,cmp22,temp,moisture,cloud cover
467,0.079127,0.7,210.3,38.24,14.4935,-5.597,0.0,6
468,-0.034618,0.7,232.2,38.58,16.5178,-5.606,0.0,8
469,-0.153309,1.7,263.9,39.12,18.6171,-5.54,0.0,10
470,-0.143419,2.2,277.5,39.35,20.5986,-5.494,0.0,13
471,-0.079128,2.2,233.4,39.64,22.4362,-5.384,0.0,15


In [None]:
from sklearn.preprocessing import RobustScaler

# Input features; the target ('cloud cover') gets its own scaler so that
# predictions can be inverse-transformed on their own later.
f_columns = ['snip', 'wind speed', 'wind dir', 'rh', 'cmp22', 'temp', 'moisture']

# Work on independent copies. If `train`/`test` are still slices of `df`, the
# assignments below raise SettingWithCopyWarning and it is ambiguous whether
# `df` is modified too.
train = train.copy()
test = test.copy()

# Fit both scalers on the training split only, so no statistics from the
# held-out rows leak into the preprocessing.
f_transformer = RobustScaler().fit(train[f_columns].to_numpy())
cnt_transformer = RobustScaler().fit(train[['cloud cover']])

# NOTE: this cell overwrites `train`/`test` with scaled values, so it is not
# idempotent -- re-run the split cell before re-running this one.
train.loc[:, f_columns] = f_transformer.transform(train[f_columns].to_numpy())
train['cloud cover'] = cnt_transformer.transform(train[['cloud cover']])

test.loc[:, f_columns] = f_transformer.transform(test[f_columns].to_numpy())
test['cloud cover'] = cnt_transformer.transform(test[['cloud cover']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

In [None]:
# Spot-check one feature column of the training split after scaling.
train['snip']

467      -0.620828
468      -0.620959
469      -0.621095
470      -0.621084
471      -0.621010
            ...   
458525   -0.576519
458526   -0.553941
458527   -0.549056
458528   -0.608804
458529   -0.609406
Name: snip, Length: 225657, dtype: float64

In [None]:
def create_dataset(X, y, time_steps):
    """Cut a time-ordered table into fixed-length look-back windows.

    Window ``k`` holds rows ``k .. k + time_steps - 1`` of ``X`` (selected by
    position, not by index label) and is paired with the value of ``y`` at
    position ``k + time_steps``, i.e. one step after the window ends.

    Args:
        X: DataFrame (or array-like) with one row per time step.
        y: Series (or array-like) aligned row for row with ``X``.
        time_steps: Number of consecutive rows in each window.

    Returns:
        Tuple ``(Xs, ys)`` of NumPy arrays. ``Xs`` has shape
        ``(len(X) - time_steps, time_steps, n_features)`` and ``ys`` has shape
        ``(len(X) - time_steps,)``. If ``X`` has ``time_steps`` rows or fewer,
        both arrays have length 0 but ``Xs`` keeps its trailing dimensions, so
        downstream code still sees a 3-D array.

    Raises:
        IndexError: If ``y`` is too short to supply a target for every window.
    """
    # Convert once up front: slicing a DataFrame with .iloc for every window
    # is far slower than slicing the underlying array.
    features = np.asarray(X)
    targets = np.asarray(y)

    n_windows = max(len(features) - time_steps, 0)
    if n_windows == 0:
        empty_Xs = np.empty((0, time_steps) + features.shape[1:], dtype=features.dtype)
        empty_ys = np.empty((0,) + targets.shape[1:], dtype=targets.dtype)
        return empty_Xs, empty_ys

    Xs = np.stack([features[start:start + time_steps] for start in range(n_windows)])
    # Integer-array indexing returns a copy and raises IndexError when a
    # position is out of range, instead of silently truncating like a slice.
    ys = targets[np.arange(time_steps, time_steps + n_windows)]
    return Xs, ys

In [None]:
time_steps = 120  # look-back window length, in rows

# Build windowed arrays shaped [samples, time_steps, n_features]. The whole
# frame is passed as X, so the (scaled) 'cloud cover' history is itself one of
# the input features; the target is the 'cloud cover' value right after each
# window.
X_train, y_train = create_dataset(X=train, y=train['cloud cover'], time_steps=time_steps)
X_test, y_test = create_dataset(X=test, y=test['cloud cover'], time_steps=time_steps)

print(X_train.shape, y_train.shape)

(225537, 120, 8) (225537,)


In [None]:
# Stacked recurrent regressor:
#   Input (time_steps, n_features)
#   -> Bidirectional LSTM(64), returning the full sequence
#   -> Dropout(0.1)
#   -> LSTM(16), returning only the last state
#   -> Dropout(0.1)
#   -> Dense(1), the scaled cloud-cover prediction
model = keras.Sequential()

# Declare the input shape once, explicitly. Previously it was passed as
# `input_shape` to the LSTM wrapped inside Bidirectional, and again to the
# second LSTM, where it was wrong (that layer receives the 128-wide output of
# the bidirectional layer) and only harmless because it is ignored on
# non-first layers.
model.add(keras.Input(shape=(X_train.shape[1], X_train.shape[2])))
model.add(
  keras.layers.Bidirectional(
    keras.layers.LSTM(units=64, return_sequences=True)
  )
)
model.add(keras.layers.Dropout(rate=0.1))

model.add(keras.layers.LSTM(units=16))
model.add(keras.layers.Dropout(rate=0.1))

model.add(keras.layers.Dense(units=1))
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Train on the windowed arrays. shuffle=False keeps the batches in the order
# the windows were generated. validation_split=0.1 sets aside a tenth of the
# training arrays for the val_loss curve (per the Keras docs this is taken
# from the end of the arrays, before any shuffling).
history = model.fit(
    X_train,
    y_train,
    batch_size=360,
    epochs=50,
    shuffle=False,
    validation_split=0.1,
)

Epoch 1/50


In [None]:
# Learning curves. `val_loss` is computed on the validation_split carved out of
# the training arrays, not on the held-out test windows, so it is labelled
# 'validation' (it was previously mislabelled 'test').
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('MSE loss (scaled units)')
plt.legend();

In [None]:
# Predict (scaled) cloud cover for every window of the held-out split.
y_pred = model.predict(X_test)

In [None]:
# Map targets and predictions back to original cloud-cover units.
# The 1-D target vectors are passed as a single row of shape (1, n); the
# prediction array is passed as returned by the model.
# NOTE(review): the scaler was fitted on one column, so (n, 1) would be the
# conventional layout; (1, n) is kept because a later plotting cell indexes
# y_test_inv[0] and y_test_inv.shape[1].
y_train_inv = cnt_transformer.inverse_transform(np.reshape(y_train, (1, -1)))
y_test_inv = cnt_transformer.inverse_transform(np.reshape(y_test, (1, -1)))
y_pred_inv = cnt_transformer.inverse_transform(y_pred)

In [None]:
# Disabled plot, kept as a string literal: training history followed by the
# true and predicted values for the held-out split on one time axis.
'''
plt.plot(np.arange(0, len(y_train)), y_train_inv.flatten(), 'g', label="history")
plt.plot(np.arange(len(y_train), len(y_train) + len(y_test)), y_test_inv.flatten(), marker='.', label="true")
plt.plot(np.arange(len(y_train), len(y_train) + len(y_test)), y_pred_inv.flatten(), 'r', label="prediction")
plt.ylabel('cloud cover')
plt.xlabel('Time Step')
plt.legend()
plt.show();
'''

In [None]:
# Held-out split: true vs predicted cloud cover in original units,
# one point per window.
plt.plot(y_test_inv.ravel(), marker='.', label="true")
plt.plot(y_pred_inv.ravel(), 'r', label="prediction")
plt.xlabel('Time Step')
plt.ylabel('cloud cover')
plt.legend()
plt.show();

In [None]:
temp = 300  # number of numbered folders (1..temp), each holding one weather_data.csv

# One prediction per folder, in original cloud-cover units.
# NaN marks a folder that had too few valid rows to build a single window.
vals = np.zeros(temp)

# Same renaming and column selection as applied to the training table; built
# once here instead of on every iteration.
column_aliases = {
    'Direct sNIP [W/m^2]': 'snip',
    'Total Cloud Cover [%]': 'cloud cover',
    'Peak Wind Speed @ 6ft [m/s]': 'wind speed',
    'Avg Wind Direction @ 6ft [deg from N]': 'wind dir',
    'Tower RH [%]': 'rh',
    'Global CMP22 (vent/cor) [W/m^2]': 'cmp22',
    'Tower Dew Point Temp [deg C]': 'temp',
    'Moisture': 'moisture',
}
req_cols = ['snip', 'wind speed', 'wind dir', 'rh', 'cmp22', 'temp', 'moisture', 'cloud cover']

# NOTE: the loop reuses the names X_test / y_test / y_pred / y_test_inv /
# y_pred_inv, so after it runs they describe the last folder processed rather
# than the held-out split of train.csv.
for i in range(1, temp + 1):
  df_test = pd.read_csv('/content/drive/MyDrive/test/{}/weather_data.csv'.format(i))
  df_test = df_test[df_test['Total Cloud Cover [%]'] >= 0]
  # .copy() so the scaled values are written into an independent frame.
  df_test = df_test.rename(columns=column_aliases)[req_cols].copy()

  if len(df_test) <= time_steps:
    # Not enough rows for even one window: record NaN instead of crashing
    # part-way through the 300 folders.
    vals[i-1] = np.nan
    print('{}(skipped)'.format(i), end=' ')
    continue

  # Reuse the scalers fitted on the training split.
  df_test.loc[:, f_columns] = f_transformer.transform(df_test[f_columns].to_numpy())
  df_test['cloud cover'] = cnt_transformer.transform(df_test[['cloud cover']])

  X_test, y_test = create_dataset(df_test, df_test['cloud cover'], time_steps)

  y_pred = model.predict(X_test)

  y_test_inv = cnt_transformer.inverse_transform(y_test.reshape(1, -1))
  y_pred_inv = cnt_transformer.inverse_transform(y_pred)

  # Keep the prediction of the final window. Index the scalar explicitly:
  # assigning a 1-element array to a scalar slot is deprecated in recent NumPy.
  # NOTE(review): the final window's target is the last row of the file, which
  # is already observed -- confirm this is the intended forecast point.
  vals[i-1] = y_pred_inv[-1, 0]
  print(i, end=' ')

In [None]:
# Sanity check: one entry per processed folder.
vals.shape

In [None]:
# Save the per-folder predictions to Drive; the file name embeds the window
# length (time_steps) used to produce them.
pd.DataFrame(vals).to_csv("/content/drive/MyDrive/test/datanew{}.csv".format(time_steps))

In [None]:
# True (red) vs predicted (blue) cloud cover for whatever y_test_inv /
# y_pred_inv currently hold -- after the per-folder loop, that is the last
# folder processed. Relies on y_test_inv having shape (1, n).
steps = np.arange(y_test_inv.shape[1])
plt.plot(steps, y_test_inv[0], 'r')
plt.plot(steps, y_pred_inv, 'b')
plt.show()

In [None]:
# Display the collected per-folder predictions.
vals