### Import libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
import matplotlib.patches as mpatches 
from matplotlib.collections import PatchCollection
import plotly.figure_factory as ff
from IPython.display import HTML, display
from IPython.core import display as ICD
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import Artificial_Neural_Networks as ANN
import ARIMA

import math
from itertools import groupby
%matplotlib inline
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from importlib import reload
import itertools

In [3]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. Each absolute error is divided by the matching
        observed value, so an observed value of 0 produces ``inf``/``nan``.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # A 1-D target against an (n, 1) prediction column would otherwise
    # broadcast to an (n, n) matrix and silently yield a wrong score, so
    # inputs holding the same number of elements are aligned first.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_pred = y_pred.reshape(y_true.shape)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

def calculate_performance(y_true, y_pred):
    """Score a forecast with four error metrics, each rounded to 3 decimals.

    Returns a tuple ``(mse, mae, mape, rmse)`` in that order.
    """
    scorers = (
        mean_squared_error,
        mean_absolute_error,
        mean_absolute_percentage_error,
        root_mean_squared_error,
    )
    return tuple(round(scorer(y_true, y_pred), 3) for scorer in scorers)

### Dataset

In [5]:
# Workbook with the rainfall records, given relative to the working directory.
PATH = 'Dataset/rainfall_data_1901_to_2002.xlsx'

# Read the workbook into a DataFrame using pandas' default read options.
data = pd.read_excel(io=PATH)

### Preprocess data

In [6]:
# Drop the spreadsheet helper column. errors='ignore' keeps this cell
# re-runnable after `data` has already been cleaned once.
data = data.drop(columns='vlookup', errors='ignore')
# Keep only rows that carry a year. The .copy() makes the 'Year' assignment
# below write to an independent frame instead of a filtered slice.
data = data[data['Year'].notnull()].copy()
data['Year'] = data['Year'].astype('int')
data = data.reset_index(drop=True)

# Restrict to Maharashtra; the State column is then constant and is dropped.
m_data = data[data['State'] == 'Maharashtra']
m_data = m_data.drop(columns='State')

districts = m_data.District.unique()
years = list(range(1901, 2003))
# NOTE(review): assumes the first three columns are State, District, Year and
# every remaining column is a month -- confirm against the workbook layout.
months = data.columns[3:]
# Year-major labels (all months of 1901, then 1902, ...) and matching
# month-start timestamps.
year_month = [str(year) + '_' + month for year in years for month in months]
dates = pd.date_range(start='1901-01', freq='MS', periods=len(years)*12)

# Long-format frame: one row per (year, month), one column per district.
maharashtra_data = pd.DataFrame({'Year_Month': year_month})
maharashtra_data['Date'] = dates
maharashtra_data[['Year', 'Month']] = maharashtra_data['Year_Month'].str.split('_', n=1, expand=True)
maharashtra_data = maharashtra_data.drop(columns=['Year_Month'])

for district in districts:
    # Order the district's rows by year (stable sort) so the flattened values
    # line up with the year-major Date/Year/Month columns built above.
    df = (
        m_data[m_data.District == district]
        .sort_values('Year', kind='mergesort')
        .drop(columns=['District', 'Year'])
    )
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old and
    # new versions alike. Flatten the (years x months) grid row by row; the
    # reshape raises if the district does not have exactly one row per year.
    df = df.values.reshape(len(years) * len(months))
    maharashtra_data[district] = df

maharashtra_data.head()

Unnamed: 0,Date,Year,Month,Ahmadnagar,Akola,Amravati,Aurangabad,Bhandara,Bid,Buldana,...,Nashik,Osmanabad,Parbhani,Pune,Sangli,Satara,Solapur,Wardha,Washim,Yavatmal
0,1901-01-01,1901,Jan,2.51,34.202,35.651,10.922,23.397,16.647,31.455,...,5.063,15.5,33.207,0.922,0.138,0.197,4.496,45.05,32.485,22.031
1,1901-02-01,1901,Feb,11.489,1.099,6.822,4.362,63.844,1.916,0.823,...,1.609,2.784,4.997,7.195,0.537,0.525,3.667,28.22,3.607,16.198
2,1901-03-01,1901,Mar,11.325,30.002,36.103,25.161,33.563,27.287,28.448,...,11.196,11.333,31.625,5.105,13.09,9.566,10.33,26.398,26.557,29.318
3,1901-04-01,1901,Apr,33.931,10.248,10.636,12.714,61.56,33.211,13.902,...,7.838,34.814,41.941,35.949,50.077,30.11,59.854,29.228,18.127,30.521
4,1901-05-01,1901,May,30.401,2.891,4.173,34.244,13.665,59.027,9.397,...,7.475,52.792,31.794,36.65,78.994,65.226,50.892,8.619,6.7,13.193


In [19]:
# Rebind m_data (until now the raw Maharashtra rows) to an independent copy of
# the reshaped monthly table, so later cells leave maharashtra_data untouched.
m_data = maharashtra_data.copy(deep=True)

### Filter data
- Season: June, July, August, and September.
- Districts: Kolhapur and Latur.

In [32]:
# District columns kept for the analysis.
districts_of_interest = [
    'Kolhapur',
    'Latur',
]

# Month labels (as they appear in the Month column) that define the season.
months_of_interest = [
    'Jun',
    'Jul',
    'Aug',
    'Sep',
]

In [33]:
# Keep only the rows for the selected months, then narrow the columns to the
# date fields plus the selected districts.
in_season = m_data['Month'].isin(months_of_interest)
kept_columns = ['Date', 'Year', 'Month'] + districts_of_interest
rainfall_season_data = m_data[in_season][kept_columns]

In [34]:
# Preview the first rows of the season/district subset.
rainfall_season_data.head()

Unnamed: 0,Date,Year,Month,Kolhapur,Latur
5,1901-06-01,1901,Jun,554.047,188.878
6,1901-07-01,1901,Jul,496.636,175.092
7,1901-08-01,1901,Aug,507.657,138.084
8,1901-09-01,1901,Sep,221.539,102.949
17,1902-06-01,1902,Jun,631.349,46.72


In [74]:
# Number of trailing observations of each series held out for testing.
N_TEST_YEARS = 5

# (district, month) -> (train frame, test frame). Keeping every split here
# means earlier pairs are not lost when the loop variables are overwritten.
rainfall_splits = {}

for district in districts_of_interest:
    temp_data = rainfall_season_data[['Date', 'Year', 'Month', district]]
    for month in months_of_interest:
        # Rows of this district for a single calendar month, re-indexed from 0.
        df = temp_data[temp_data.Month == month]
        df = df[['Date', district]].reset_index(drop=True)
        dates = df.Date

        # Date-indexed series, split into all-but-last-N (train) and last-N (test).
        precipitation = df.set_index('Date')[district].rename('Precipitation')
        rainfall_data = precipitation.iloc[:-N_TEST_YEARS].to_frame()
        test_rainfall_data = precipitation.iloc[-N_TEST_YEARS:].to_frame()

        rainfall_splits[(district, month)] = (rainfall_data, test_rainfall_data)

# After the loop, df / rainfall_data / test_rainfall_data still refer to the
# last pair processed (last district, last month), as before.

In [78]:
# Training split left over from the final loop iteration (last district, last month).
rainfall_data.head()

Unnamed: 0_level_0,Precipitation
Date,Unnamed: 1_level_1
1901-09-01,102.949
1902-09-01,339.679
1903-09-01,321.717
1904-09-01,406.302
1905-09-01,198.545


In [76]:
# Held-out split left over from the final loop iteration: the last five rows of that series.
test_rainfall_data.head()

Unnamed: 0_level_0,Precipitation
Date,Unnamed: 1_level_1
1998-09-01,273.964
1999-09-01,208.238
2000-09-01,89.69
2001-09-01,108.773
2002-09-01,89.07


In [79]:
# `df` is the loop's working frame from its final iteration (Date plus the last district's column).
df.head()

Unnamed: 0,Date,Latur
0,1901-09-01,102.949
1,1902-09-01,339.679
2,1903-09-01,321.717
3,1904-09-01,406.302
4,1905-09-01,198.545
