## Analysis
- Months of interest (monsoon season) - June, July, August, September.
- Districts of interest - Kolhapur, Latur

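- A new two-input LSTM model to predict monthly rainfall: one input is the rainfall of the immediately preceding months, the other is the rainfall of the same month in previous years.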

### Import libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
import matplotlib.patches as mpatches 
from matplotlib.collections import PatchCollection
import plotly.figure_factory as ff
from IPython.display import HTML, display
from IPython.core import display as ICD
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import Artificial_Neural_Networks as ANN
import ARIMA

import math
from itertools import groupby
%matplotlib inline
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.utils import plot_model
from keras.layers import Conv2D, MaxPooling2D, Flatten
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model, Sequential
from keras.layers.merge import concatenate
from keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.models import model_from_json

from importlib import reload
import itertools

### Useful functions

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Both inputs are flattened before comparison, so a column vector of
    shape (n, 1) and a flat vector of shape (n,) are compared element by
    element instead of being broadcast against each other.

    Observations whose true value is exactly zero are excluded, because
    the percentage error is undefined there (division by zero).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values; must contain as many elements as ``y_true``.

    Returns
    -------
    float
        MAPE over the non-zero observations, or ``nan`` when every
        observation is zero (or the input is empty).
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    # Percentage error is undefined for zero actuals; skip them rather
    # than letting inf/nan poison the mean.
    valid = y_true != 0
    if not valid.any():
        return float('nan')

    relative_error = (y_true[valid] - y_pred[valid]) / y_true[valid]
    return np.mean(np.abs(relative_error)) * 100

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error of the predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

def calculate_performance(y_true, y_pred):
    """Compute the error metrics for a set of predictions.

    Returns
    -------
    tuple
        ``(mse, mae, mape, rmse)``, each rounded to three decimals.
    """
    scores = (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
        root_mean_squared_error(y_true, y_pred),
    )
    return tuple(round(score, 3) for score in scores)

### Dataset

In [None]:
# Location of the rainfall workbook, relative to the notebook's working directory.
PATH = 'Dataset/rainfall_data_1901_to_2002.xlsx'
# Load the workbook (pandas reads the first sheet by default) into a DataFrame.
data = pd.read_excel(PATH)

### Preprocess data

In [None]:
# Remove the 'vlookup' column (presumably a spreadsheet helper - TODO confirm)
# and the rows that carry no year.
data = data.drop(columns='vlookup')
# .copy() so the column assignment below writes to an independent frame
# instead of a slice of the original (avoids chained-assignment issues).
data = data[data['Year'].notnull()].copy()
data['Year'] = data.Year.astype('int')
data.index = range(len(data))

# Keep only the Maharashtra rows; the State column is then redundant.
m_data = data[data['State'] == 'Maharashtra']
m_data = m_data.drop(columns='State')

districts = m_data.District.unique()
years = list(range(1901, 2003))
# Everything after the first three columns is treated as a month column.
months = data.columns[3:]
year_month = [str(year) + '_' + month for year in years for month in months]
dates = pd.date_range(start='1901-01', freq='MS', periods=len(years)*12)

# Long-format frame: one row per (year, month), one column per district.
maharashtra_data = pd.DataFrame({'Year_Month': year_month})
maharashtra_data['Date'] = dates
maharashtra_data[['Year', 'Month']] = maharashtra_data['Year_Month'].str.split('_', n=1, expand=True)
maharashtra_data = maharashtra_data.drop(columns=['Year_Month'])

for district in districts:
    df = m_data[m_data.District == district].drop(columns=['District', 'Year'])
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
    # every pandas version. Flatten row by row so the values line up with
    # year_month (assumes the rows of each district are in chronological
    # order - TODO confirm). The explicit size makes reshape fail loudly if a
    # district does not have exactly len(years) * len(months) values.
    df = df.values.reshape(len(years) * len(months))
    maharashtra_data[district] = df

maharashtra_data.head()

In [None]:
# From here on m_data is a copy of the reshaped (date x district) frame,
# replacing the Maharashtra subset that was bound to this name earlier.
m_data = maharashtra_data.copy()

In [None]:
# Preview the first rows of the working copy.
m_data.head()

### Useful functions

In [None]:
def get_combinations(parameters):
    """Return every combination (Cartesian product) of the given value lists.

    ``parameters`` is an iterable of iterables; the result is a list of
    tuples holding one value from each inner iterable, in the same order.
    """
    return [combination for combination in itertools.product(*parameters)]

In [None]:
def get_latest_file(path):
    """Return the entry of directory ``path`` with the greatest ctime.

    The returned value is ``path`` joined with the entry name.
    """
    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
    return max(candidates, key=os.path.getctime)

In [None]:
def LSTM_model(num_of_previous_months, hidden_nodes_months,
               num_of_previous_years, hidden_nodes_years, output_nodes):
    """Build and compile a two-branch LSTM model.

    Each branch takes a sequence with one feature per time step: the first
    has ``num_of_previous_months`` steps, the second
    ``num_of_previous_years`` steps. The outputs of the two LSTM layers are
    concatenated and passed to a dense layer with ``output_nodes`` units.

    The model is compiled with mean squared error loss and the Adam
    optimizer, and its diagram is written to 'Functional_LSTM.png'.

    Returns
    -------
    The compiled Keras ``Model``.
    """
    # Branch 1: sequence of previous months, shape (time steps, 1 feature).
    months_input = Input((num_of_previous_months, 1))
    months_features = LSTM(hidden_nodes_months, activation='relu')(months_input)

    # Branch 2: sequence of previous years, shape (time steps, 1 feature).
    years_input = Input((num_of_previous_years, 1))
    years_features = LSTM(hidden_nodes_years, activation='relu')(years_input)

    # Join both feature vectors and map them to the output.
    merged_features = concatenate([months_features, years_features])
    prediction = Dense(output_nodes)(merged_features)

    network = Model(inputs=[months_input, years_input], outputs=prediction)
    network.compile(loss='mean_squared_error', optimizer='adam')

    # Side effect: saves the architecture diagram next to the notebook.
    plot_model(network, 'Functional_LSTM.png', show_shapes=True, show_layer_names=True)

    return network

In [None]:
def preprocess_data(m_data, district, month, num_of_prev_months, num_of_prev_years):
    """Build the two LSTM inputs and the targets for one district and month.

    For every occurrence of ``month`` after the first ``num_of_prev_years``
    occurrences, one sample is produced:

    * input 1 - the ``num_of_prev_months`` rainfall values immediately
      preceding the target row;
    * input 2 - the rainfall of ``month`` in the ``num_of_prev_years``
      preceding occurrences;
    * target  - the rainfall of ``month`` itself.

    Parameters
    ----------
    m_data : pandas.DataFrame
        Frame with 'Date', 'Year', 'Month' columns and one column per
        district (assumes rows are in chronological order - TODO confirm).
    district : str
        Name of the district column to use.
    month : str
        Value of the 'Month' column to predict.
    num_of_prev_months : int
        Window length of input 1.
    num_of_prev_years : int
        Window length of input 2.

    Returns
    -------
    tuple of numpy.ndarray
        ``(input_1, input_2, y)`` with shapes
        ``(samples, num_of_prev_months, 1)``,
        ``(samples, num_of_prev_years, 1)`` and ``(samples, 1)``.

    Raises
    ------
    ValueError
        If the first month window would start before the beginning of the
        series.
    """
    rainfall_data = m_data[['Date', 'Year', 'Month', district]]
    district_values = rainfall_data[district].values

    # Positional row numbers of the requested month. Working with positions
    # (not index labels) keeps the slicing correct for any DataFrame index.
    month_positions = np.flatnonzero((rainfall_data.Month == month).values)
    month_values = district_values[month_positions]

    # The first num_of_prev_years occurrences only serve as history.
    target_positions = month_positions[num_of_prev_years:]
    if len(target_positions) > 0 and target_positions[0] < num_of_prev_months:
        raise ValueError(
            'num_of_prev_months=%d reaches before the start of the series '
            '(first target row is at position %d)'
            % (num_of_prev_months, target_positions[0]))

    # Input 1: the months immediately preceding each target row.
    month_windows = [district_values[position - num_of_prev_months:position]
                     for position in target_positions]
    train_data_input_1 = np.array(month_windows).reshape(
        len(month_windows), num_of_prev_months, 1)

    # Input 2: the same month in the preceding years.
    year_windows = [month_values[start:start + num_of_prev_years]
                    for start in range(len(month_values) - num_of_prev_years)]
    train_data_input_2 = np.array(year_windows).reshape(
        len(year_windows), num_of_prev_years, 1)

    # Targets: the month's rainfall for every sample, as a column vector.
    y_train = np.array(month_values[num_of_prev_years:])
    y_train = np.reshape(y_train, (y_train.shape[0], 1))

    return train_data_input_1, train_data_input_2, y_train

In [None]:
def split_and_train_LSTM(model, input_1, input_2, y_train_main, future_steps, epochs, batch_size):
    """Split the samples, train the model and restore the best checkpoint.

    Parameters
    ----------
    model : compiled Keras model taking two inputs.
    input_1, input_2 : arrays with one row per sample for the two inputs.
    y_train_main : array of targets, one row per sample.
    future_steps : passed to ``train_test_split`` as ``test_size``.
    epochs, batch_size : forwarded to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, X_test_input_1, X_test_input_2, y_test)`` where ``model``
        carries the weights of the most recently saved checkpoint.

    Side effects: creates 'Models/' if needed, writes checkpoints into it
    during training and removes its (non-hidden) contents afterwards.
    """
    # A single call splits all three arrays with the same permutation, so the
    # two inputs and the targets are guaranteed to stay aligned.
    # NOTE(review): train_test_split shuffles by default, so the held-out
    # samples are a random subset, not the chronologically last ones -
    # confirm this is intended for a "future steps" evaluation.
    (X_train_input_1, X_test_input_1,
     X_train_input_2, X_test_input_2,
     y_train, y_test) = train_test_split(input_1, input_2, y_train_main,
                                         test_size=future_steps, random_state=42)

    print(X_train_input_1.shape)
    print(X_train_input_2.shape)

    SOURCE_PATH = 'Models/'
    # The checkpoint callback needs the target directory to exist.
    os.makedirs(SOURCE_PATH, exist_ok=True)
    checkpoint = ModelCheckpoint(SOURCE_PATH + 'model-{epoch:03d}.h5', verbose=1, monitor='val_loss',save_best_only=True, mode='auto')

    model.fit([X_train_input_1, X_train_input_2], y_train, epochs=epochs, batch_size=batch_size, verbose=1, shuffle=True, validation_split=0.1, callbacks=[checkpoint])

    # With save_best_only=True a checkpoint is written only when val_loss
    # improves, so the newest file holds the best weights seen.
    file_name = get_latest_file(SOURCE_PATH)
    model.load_weights(file_name)

    # Portable replacement for `rm -rf Models/*`: remove every non-hidden
    # entry (a shell glob does not match dot-files either).
    for entry in os.listdir(SOURCE_PATH):
        if entry.startswith('.'):
            continue
        entry_path = os.path.join(SOURCE_PATH, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)

    return model, X_test_input_1, X_test_input_2, y_test

In [None]:
def predict_LSTM(model, X_test_input_1, X_test_input_2):
    """Return the model's predictions for the given pair of test inputs."""
    return model.predict([X_test_input_1, X_test_input_2])

In [None]:
def Long_Short_Term_Memory(data, district, month, num_of_prev_months, num_of_prev_years, hidden_nodes_months, hidden_nodes_years, epochs, batch_size, future_steps, output_nodes=1):
    """Build, train and evaluate the two-input LSTM for one district and month.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame handed to ``preprocess_data``.
    district, month : str
        District column and month value to model.
    num_of_prev_months, num_of_prev_years : int
        Window lengths of the two model inputs.
    hidden_nodes_months, hidden_nodes_years : int
        Number of units of the two LSTM layers.
    epochs, batch_size : int
        Training settings forwarded to ``split_and_train_LSTM``.
    future_steps :
        Size of the held-out set, forwarded to ``split_and_train_LSTM``.
    output_nodes : int, optional
        Units of the output layer. Defaults to 1, matching the
        ``(samples, 1)`` targets produced by ``preprocess_data``.

    Returns
    -------
    tuple
        ``(model, y_test, y_pred)``.
    """
    model = LSTM_model(num_of_prev_months, hidden_nodes_months, num_of_prev_years, hidden_nodes_years, output_nodes)
    # Use the frame that was passed in, not whatever global happens to exist.
    train_data_input_1, train_data_input_2, y_train_main = preprocess_data(data, district, month, num_of_prev_months, num_of_prev_years)
    model, X_test_input_1, X_test_input_2, y_test = split_and_train_LSTM(model, train_data_input_1, train_data_input_2, y_train_main, future_steps, epochs, batch_size)

    y_pred = predict_LSTM(model, X_test_input_1, X_test_input_2)
    return model, y_test, y_pred

In [None]:
def find_best_method_LSTM(parameters_LSTM, m_data, district, month):
    """Train one LSTM per parameter combination and tabulate the results.

    Parameters
    ----------
    parameters_LSTM : list of lists
        Candidate values, in this order: number of previous months, number
        of previous years, hidden nodes (months), hidden nodes (years),
        epochs, batch size, future steps.
    m_data : pandas.DataFrame
        Frame handed to ``Long_Short_Term_Memory``.
    district, month : str
        District column and month value to model.

    Returns
    -------
    pandas.DataFrame
        One row per combination: the seven parameters, then MSE, MAE, RMSE
        and MAPE, then one column per prediction named '1', '2', ...
    """
    combination_of_params = get_combinations(parameters_LSTM)
    total = len(combination_of_params)
    information_LSTM = []
    max_predictions = 0

    for iterator, param in enumerate(combination_of_params, start=1):
        # Progress trail: "1 -> 2 -> ... -> N".
        if iterator != total:
            print(iterator, end=' -> ')
        else:
            print(iterator)

        (num_of_previous_months, num_of_previous_years,
         hidden_nodes_months, hidden_nodes_years,
         epochs, batch_size, future_steps) = param[:7]

        model_LSTM, y_test, y_pred = Long_Short_Term_Memory(m_data, district, month, num_of_previous_months, num_of_previous_years,
                  hidden_nodes_months, hidden_nodes_years, epochs, batch_size, future_steps)

        mse, mae, mape, rmse = calculate_performance(y_test, y_pred)

        # y_pred is an array (one row per test sample); `list + ndarray`
        # would broadcast instead of appending, so flatten it to a list first.
        predictions = np.asarray(y_pred).ravel().tolist()
        max_predictions = max(max_predictions, len(predictions))

        info = list(param) + [mse, mae, rmse, mape] + predictions
        information_LSTM.append(info)

    base_columns = ['Num of previous months', 'Num of previous years', 'Hidden nodes months',
                    'Hidden nodes years', 'epochs', 'batch_size', 'future_steps',
                    'MSE', 'MAE', 'RMSE', 'MAPE']
    # Prediction columns are sized from the predictions actually produced, so
    # an empty grid or combinations with different test sizes still yield a
    # well-formed frame (shorter rows are padded with NaN).
    indexes = [str(i) for i in range(1, max_predictions + 1)]
    columns = base_columns + indexes
    padded_rows = [row + [np.nan] * (len(columns) - len(row)) for row in information_LSTM]

    information_LSTM_df = pd.DataFrame(padded_rows, columns=columns)
    return information_LSTM_df

In [None]:
# Size of the held-out set; it ends up as test_size in train_test_split.
future_steps = 10

# number_of_previous_months, number_of_previous_years, hidden_nodes_months, hidden_nodes_years, epochs, batch_size, future_steps
# Full search grid (one list of candidate values per parameter, in the order above).
parameters_LSTM = [[4,8,12], [4,6,8], [6,8,10,12], [5,7,8], [250], [10], [future_steps]]
# NOTE: this second assignment overrides the grid above with a single
# combination; remove or comment it out to run the full search.
parameters_LSTM = [[4], [4], [6], [5], [250], [10], [future_steps]]

In [None]:
# model, y_pred = Long_Short_Term_Memory(m_data, district, month, num_of_previous_months, num_of_previous_years,
#                       hidden_nodes_months, hidden_nodes_years, 250, 10, future_steps)

In [None]:
# District column names to model.
districts_of_interest = ['Kolhapur', 'Latur']
# Values of the 'Month' column to model (June to September).
months_of_interest = ['Jun', 'Jul', 'Aug', 'Sep']

In [None]:
# NOTE(review): STORAGE_FOLDER is not referenced in the visible code -
# presumably intended as the output location for the results; confirm.
STORAGE_FOLDER = 'State_predictions_new_LSTM'
for district in districts_of_interest:
    for month in months_of_interest:
        # NOTE(review): the returned DataFrame is neither stored nor displayed.
        find_best_method_LSTM(parameters_LSTM, m_data, district, month)
        # The two breaks stop after the first (district, month) pair, so only
        # a single search is run; remove them to cover every pair.
        break
    break