## 1.0 Data Preparation

### 1.1 Dependencies

In [None]:
# !pip install pivottablejs
# !pip install missingno
# !pip install imblearn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### 1.2 Reading the Dataset

In [None]:
# Load the CSV, using its 'date' column as the row index, then preview the top rows.
weather = pd.read_csv('All_Year_Data.csv', index_col='date')
weather.head()

In [None]:
# Preview the last five rows to check where the data ends.
weather.tail(5)

## 2.0 Data Preprocessing

### 2.1 Dataset Information

In [None]:
# Print index type, per-column non-null counts and dtypes, and memory usage.
weather.info()

In [None]:
# Show the dtype pandas inferred for each column.
weather.dtypes

In [None]:
# Inspect the index as loaded from the CSV (before the datetime conversion in the next cell).
weather.index

In [None]:
# Convert the index to datetimes so that date-string slicing and
# weather.index.month (used later in the notebook) work.
weather.index = pd.to_datetime(weather.index)
weather.index

### 2.2 Dataset Summary Statistics

In [None]:
# Summary statistics, rendered with a colour gradient so large and small
# values stand out within each column.
weather.describe().style.background_gradient()

### 2.3 Unique Value Counts in all Attributes

In [None]:
# Number of distinct values per column, smallest first (sort_values is ascending by default).
weather.nunique().sort_values()

In [None]:
# For every column, print its name, how many distinct values it holds
# (unique() keeps NaN as a value), and the values themselves.
for column_name in weather.columns:
    distinct_values = weather[column_name].unique()
    print(column_name, len(distinct_values), distinct_values)

### 2.4 Checking Missing Values

In [None]:
# Count of missing (NaN) entries in each column.
weather.isnull().sum()

In [None]:
# Percentage of missing entries per column.
missing_pct = weather.isnull().mean() * 100

# Apply the '%' suffix only while printing: option_context restores the previous
# setting even if the print fails, so the '%' format cannot leak into later cells.
with pd.option_context('display.float_format', '{:,.2f}%'.format):
    print(missing_pct)

# As before, leave two-decimal float display switched on for the rest of the notebook.
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# weather.apply(pd.isnull).sum( )/weather.shape[0]

## 3.0 Data Visualization

### 3.1 Plot for missing values

In [None]:
import missingno as msno

# Nullity matrix: one column per attribute, gaps mark missing rows.
# `labels` is a boolean switch for showing column names; the previous
# `[weather.columns]` only worked because a non-empty list is truthy.
msno.matrix(weather, labels=True, figsize=(20, 6), fontsize=12)

### 3.2 Understanding the Distribution of the Data

In [None]:
# Histogram with a KDE overlay for each int/float column, three plots per row.
plt.rcParams['axes.facecolor'] = 'white'  # set once; read when each subplot is created
numeric_cols = weather.select_dtypes(include=['int', 'float']).columns
n_cols = 3
# Keep the original 4 rows as a minimum, but grow the grid when there are more
# than 12 columns: a fixed 4x3 grid makes plt.subplot raise on the 13th plot.
n_rows = max(4, -(-len(numeric_cols) // n_cols))  # ceiling division

plt.figure(figsize=(15, 15))
for position, col in enumerate(numeric_cols, start=1):
    ax = plt.subplot(n_rows, n_cols, position)
    sns.histplot(data=weather, x=col, ax=ax, color='green', kde=True)
plt.suptitle('Data distribution for all the Columns in the data set', fontsize=18)
plt.tight_layout()

### 3.3 Box Plot

In [None]:
# Box plot for each int/float column, three plots per row, to expose outliers.
plt.rcParams['axes.facecolor'] = 'white'  # set once; read when each subplot is created
numeric_cols = weather.select_dtypes(include=['int', 'float']).columns
n_cols = 3
# Keep the original 4 rows as a minimum, but grow the grid when there are more
# than 12 columns: a fixed 4x3 grid makes plt.subplot raise on the 13th plot.
n_rows = max(4, -(-len(numeric_cols) // n_cols))  # ceiling division

plt.figure(figsize=(15, 15))
for position, col in enumerate(numeric_cols, start=1):
    ax = plt.subplot(n_rows, n_cols, position)
    sns.boxplot(data=weather, x=col, ax=ax, color='green')
plt.suptitle('Data distribution')
plt.tight_layout()

### 3.4 Heatmap showing the correlation of the data

In [None]:
# Pairwise correlation of the int/float columns, annotated and colour-centred on zero.
numeric_part = weather.select_dtypes(include=['int', 'float'])
correlations = numeric_part.corr()

plt.figure(figsize=(15, 10))
sns.heatmap(correlations, annot=True, center=0)
plt.show()

### 3.5 Analysis using Pivot Tables (JS)

In [None]:
from pivottablejs import pivot_ui
# Open an interactive pivot-table widget over the full data frame.
pivot_ui(weather)

### 3.6 Line plot data visualization

### 3.6.1 Renaming the columns

In [None]:
# Keep only the eight columns used from here on and give them readable names.
# The mapping is raw CSV name -> notebook name; dict order fixes the column order.
column_names = {
    "tmax_C": "Temp_max",
    "tmin_C": "Temp_min",
    "rain_mm": "Rainfall",
    "hum%_9AM": "Humid_9AM",
    "hum%_3PM": "Humid_3PM",
    "windsp_9AM": "Wind_9AM",
    "windsp_3PM": "Wind_3PM",
    "winddir_3PM": "Wind_direction",
}
weather = weather[list(column_names)].copy()  # copy: later edits must not touch the original frame
weather.columns = list(column_names.values())
weather

### 3.6.2 Temperature plot

In [None]:
# Line plot of Temp_max and Temp_min against the date index.
weather[["Temp_max","Temp_min"]].plot(figsize=(15,6), title='Temperature')
plt.show()

### 3.6.3 Rainfall plot

In [None]:
# Line plot of Rainfall against the date index.
weather[["Rainfall"]].plot(figsize=(15, 6), title='Rainfall')
plt.show()

### 3.6.3.1 Normalizing the data
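- This is useful when a series does not read well as a time series, for example when isolated spikes make it look more like a bar plot. A log transform compresses those spikes so the series follows a smoother time-series pattern. Note that `np.log` is undefined for zero values, so days with no rainfall would need `np.log1p` instead. It is not needed for this particular scenario, so the cells below are left commented out.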

In [None]:
# weather['Rainfall'] = np.log(weather['Rainfall'])
# weather

In [None]:
# weather['Rainfall'].plot(figsize = (20,6))

In [None]:
# Total rainfall per calendar month (1-12), pooled across all years in the index.
# Select the column before aggregating so only Rainfall is summed, rather than
# summing every column and discarding all but one.
weather.groupby(weather.index.month)["Rainfall"].sum()

### 3.6.4 Wind speed plot

In [None]:
# Line plot of the 9AM and 3PM wind-speed columns against the date index.
weather[["Wind_9AM","Wind_3PM"]].plot(figsize = (15,6), title = 'Wind Speed')
plt.show()

### 3.6.4.1 Filtering and cleaning the outlier in the data (Wind_9AM = -3)

In [None]:
# Inspect all columns for 1-15 September 2019 (label slice on the datetime index).
weather.loc["2019-09-01":"2019-09-15",:]

In [None]:
# Isolate the physically impossible (negative) 9AM wind speeds and plot them as points.
negative_wind = weather.loc[weather['Wind_9AM'] < 0, 'Wind_9AM']
negative_wind.plot(style='.', figsize=(15, 5), title='Wind Speed Outliers')

- Distribution check showing the outlier -3

In [None]:
# Frequency of each distinct Wind_9AM value; a negative value shows up as its own bar.
plt.rcParams['axes.facecolor'] = 'white'
sns.countplot(x='Wind_9AM', data=weather)

### 3.6.4.2 Replacing the value with +3
- I chose +3 because, looking closely at the data in this column, the pattern from 13/09/2019 - 15/09/2019 seems to repeat the one from 16/09/2019 - 18/09/2019, so the negative sign was most likely a data-entry mistake.

In [None]:
# Overwrite the erroneous reading with +3 using a single .loc call on the frame.
# The chained form weather['Wind_9AM'].loc[...] = 3 assigns into an intermediate
# Series, which is not guaranteed to update `weather` and never does under
# pandas Copy-on-Write.
weather.loc['2019-09-11', 'Wind_9AM'] = 3


In [None]:
# Re-check for negative 9AM wind speeds after the correction.
remaining_negative = weather.query('Wind_9AM < 0')['Wind_9AM']
if remaining_negative.empty:
    # An empty selection is the expected outcome. Say so explicitly rather than
    # drawing a blank figure (plotting empty data may also fail on some pandas versions).
    print('No negative Wind_9AM readings remain.')
else:
    remaining_negative.plot(style='.', figsize=(15, 5), title='Wind Speed Outliers')

In [None]:
# Same wind-speed plot as earlier, redrawn to check the effect of the Wind_9AM correction.
weather[["Wind_9AM","Wind_3PM"]].plot(figsize = (15,6), title = 'Wind Speed')
plt.show()

In [None]:
# Inspect 5-15 September 2019 to confirm the corrected Wind_9AM value in context.
weather.loc["2019-09-05":"2019-09-15",:]

In [None]:
# Frequency of each distinct Wind_9AM value, redrawn after the correction.
plt.rcParams['axes.facecolor'] = 'white'
sns.countplot(x='Wind_9AM', data=weather)

### 3.6.5 Humidity plot

In [None]:
# Line plot of the 9AM and 3PM humidity columns against the date index.
weather[['Humid_9AM', 'Humid_3PM']].plot(figsize = (15,6), title = 'Humidity')

### 3.6.6 Wind direction plot

In [None]:
# Line plot of Wind_direction against the date index.
# NOTE(review): this only works if Wind_direction is numeric — confirm its encoding.
weather[['Wind_direction']].plot(figsize = (15,6), title = 'Wind Direction')

In [None]:
# weatherdf = pd.get_dummies(weather)

## 4.0 Modelling

### 4.1 Feature/Target Split and Train/Test Split

In [None]:
# Target is the daily maximum temperature; every other column is a feature.
y = weather['Temp_max']
X = weather.drop(columns=['Temp_max'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state makes the split reproducible.
# NOTE(review): train_test_split shuffles rows by default, so test dates are
# interleaved with training dates — confirm this is intended for time-indexed data.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1234)

In [None]:
# Sanity-check the split sizes: train/test features, then train/test targets.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### 4.2 Dependent Algorithms

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

### 4.3 Preparing the pipeline for tuning models

In [None]:
# One single-step pipeline per candidate regressor, all seeded identically.
# make_pipeline names each step after the lowercased class name, which the
# hyperparameter grid keys rely on.
pipelines = {
    short_name: make_pipeline(estimator_cls(random_state=1234))
    for short_name, estimator_cls in (
        ('rf', RandomForestRegressor),
        ('gb', GradientBoostingRegressor),
        ('ridge', Ridge),
        ('lasso', Lasso),
        ('enet', ElasticNet),
    )
}

### 4.4 Hyperparameter tuning using Grid-Search-Cross-Validation

In [None]:
# List the tunable parameters (and their defaults) of a random forest regressor.
RandomForestRegressor().get_params()

In [None]:
# Grid-search space for each pipeline in `pipelines` (same keys).
# Parameter names use the '<pipeline step name>__<estimator parameter>' form.
hypergrid = {
    'rf': {
        'randomforestregressor__min_samples_split': [2, 4, 6],
        'randomforestregressor__min_samples_leaf': [1, 2, 3],
    },
    'gb': {
        # GradientBoostingRegressor's `alpha` is only used by the huber and
        # quantile losses, so searching it under the default loss re-fits the
        # same model for every value. Tune the learning rate instead.
        # Scores quoted elsewhere in the notebook should be re-checked after re-running.
        'gradientboostingregressor__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
    },
    # For the linear models, `alpha` is the regularisation strength.
    'ridge': {
        'ridge__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99],
    },
    'lasso': {
        'lasso__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99],
    },
    'enet': {
        'elasticnet__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99],
    },
}

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import NotFittedError

In [None]:
# Run a 10-fold grid search for every pipeline and keep the fitted searches by name.
fit_models = {}
for algo, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hypergrid[algo], cv=10, n_jobs=-1)
    print('Starting training for {}.'.format(algo))
    try:
        # Only the fit can fail; keep the try body to that one call.
        model.fit(X_train, y_train)
    except (ValueError, NotFittedError) as e:
        # Fitting problems (bad parameter names, NaNs in the data, every CV fit
        # failing) surface as ValueError, which the NotFittedError-only handler
        # never caught. Report the failure and continue with the next algorithm.
        print('{} could not be fit: {!r}'.format(algo, e))
        continue
    fit_models[algo] = model
    print('{} has been successfully fit.'.format(algo))

In [None]:
# fit_models['ridge'].predict(X_test)

## 5.0 Evaluation and Choosing BestFit model

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

In [None]:
# Score every fitted search on the held-out set: R2 (higher is better) and MAE (lower is better).
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    r2 = r2_score(y_test, yhat)
    mae = mean_absolute_error(y_test, yhat)
    print(f'{algo} scores - R2: {r2}  MAE: {mae}')

#### The best-fit model is the Gradient Boosting Regressor because:
- It has the lowest mean absolute error (MAE): 0.90
- It has the highest coefficient of determination (R²): 0.715 (R² measures the share of variance explained, not classification accuracy)

In [None]:
# Keep the gradient-boosting grid search as the chosen model and display it.
# NOTE(review): the 'gb' key is hard-coded; re-check it against the printed scores after any re-run.
best_model = fit_models['gb']
best_model

## 6.0 Forecasting Predictions

In [None]:
# Predict Temp_max for the held-out rows with the gradient-boosting search and show the values.
yhat = fit_models['gb'].predict(X_test)
yhat

In [None]:
# Check which container type predict() returned before wrapping it in a DataFrame.
type(yhat)

In [None]:
# Wrap the predictions in a one-column frame that reuses y_test's index, so each
# prediction lines up with its actual value. Without this the frame gets a fresh
# 0..n-1 index that cannot be aligned with the actuals.
predictions = pd.DataFrame({'Temp_pred': yhat}, index=y_test.index)

predictions.tail(5)

In [None]:
# Last five actual Temp_max values of the test set, for comparison with the predictions.
y_test.tail(5)

In [None]:
# Check the container type of the actual values.
type(y_test)

In [None]:
# Check the container type of the predictions frame.
type(predictions)

In [None]:
# Convert the actual values into a one-column DataFrame named 'Temp_max'.
# NOTE: this rebinds y_test, so cells run afterwards see a DataFrame here.
y_test = pd.DataFrame(y_test, columns = ['Temp_max'])
# A bare expression that is not the last line of a cell is not displayed.
y_test
type(y_test)

## 7.0 Saving, Deleting and Loading the BestFitModel

In [None]:
import pickle
# Serialise the chosen model to disk; the context manager closes the file
# even if dumping fails.
with open('BestFit_model.pkl', "wb") as f:
    pickle.dump(best_model, f)

In [None]:
# Drop the in-memory reference so the next cell has to restore the model from the saved file.
del best_model

In [None]:
# Restore the model from the pickle file written above.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created or trust.
with open('BestFit_model.pkl', "rb") as f:
    best_model = pickle.load(f)