In [1]:
import datetime as dt
import json
import os
import random
import time
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import xgboost as xgb
from bs4 import BeautifulSoup
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Suppress every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides library deprecation and
# convergence warnings -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the raw weather observations from 'weatherAUS.csv' (path is relative to the working directory).
data_weather = pd.read_csv('weatherAUS.csv')

In [3]:
# Parse the 'Date' strings into datetimes and keep only the rows where both
# rain labels ('RainToday' and 'RainTomorrow') are present.
data_weather = (
    data_weather
    .assign(Date=pd.to_datetime(data_weather['Date'], format='%Y-%m-%d'))
    .dropna(subset=['RainToday', 'RainTomorrow'])
)

In [4]:
# Look up the latitude & longitude of every location through the
# positionstack forward-geocoding API.

locs = data_weather.Location.unique()
locs.sort()
# NOTE(review): requests go over plain HTTP, so the access key travels
# unencrypted -- switch to https if the account plan allows it.
api = "http://api.positionstack.com/v1/forward"
# The access key is read from the environment so that it is never stored in
# (or committed with) the notebook.
token = os.environ.get('POSITIONSTACK_API_KEY')
if not token:
    raise RuntimeError(
        'Set the POSITIONSTACK_API_KEY environment variable before running this cell.'
    )

# Coordinates stored (as strings, like the API results) whenever a location
# cannot be geocoded. NOTE(review): these look like central Sydney -- confirm
# that a shared fallback point is acceptable for downstream use.
FALLBACK_LATITUDE = '-33.865143'
FALLBACK_LONGITUDE = '151.209900'

locs_df = pd.DataFrame(locs, columns=['Location'])
locs_df['latitude'] = ''
locs_df['longitude'] = ''
for ind, row in locs_df.iterrows():
    try:
        # `params` lets requests URL-encode the query; the timeout keeps a
        # stalled connection from hanging the notebook.
        response = requests.get(
            api,
            params={
                'access_key': token,
                'query': row['Location'] + ', Australia',
                'limit': 1,
            },
            timeout=10,
        )
        response.raise_for_status()
        best_match = response.json()['data'][0]
        latitude = best_match['latitude']
        longitude = best_match['longitude']
        if latitude is None or longitude is None:
            raise ValueError('geocoder returned no coordinates')
        latitude, longitude = str(latitude), str(longitude)
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as err:
        # Best effort: keep going with the fallback point, but say so.
        # Only the exception type is printed because request errors can embed
        # the full URL, which contains the access key.
        print(
            f"Geocoding failed for {row['Location']} ({type(err).__name__}); "
            "using fallback coordinates."
        )
        latitude, longitude = FALLBACK_LATITUDE, FALLBACK_LONGITUDE
    locs_df.loc[ind, 'latitude'] = latitude
    locs_df.loc[ind, 'longitude'] = longitude


In [5]:
# Preview the first five geocoded locations.
locs_df.head()

Unnamed: 0,Location,latitude,longitude
0,Adelaide,-34.929075,138.602578
1,Albany,-35.031075,117.89494
2,Albury,-36.074771,146.914504
3,AliceSprings,-33.865143,151.2099
4,BadgerysCreek,-33.865143,151.2099


In [6]:
def str_to_num(data, col):
    """Build a ``value -> integer code`` lookup for one column.

    Codes start at 0 and follow the order in which ``data[col].unique()``
    returns the distinct values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column.
    col : str
        Name of the column whose distinct values are enumerated.

    Returns
    -------
    dict
        Mapping from each distinct value to its integer code.
    """
    return {value: code for code, value in enumerate(data[col].unique())}

def clean_data(data, max_missing_frac=0.1):
    """Prepare one location's observations for modelling.

    Steps, in order:

    1. Drop the 'Location' column.
    2. Drop every column whose share of missing values (relative to the
       number of input rows, rounded to two decimals) is at least
       ``max_missing_frac``.
    3. Fill the remaining gaps in numeric columns with that column's mean.
    4. Drop the rows that still contain a missing value (i.e. gaps in
       non-numeric columns, which are not imputed).

    Parameters
    ----------
    data : pandas.DataFrame
        Observations; must contain a 'Location' column.
    max_missing_frac : float, default 0.1
        Columns with at least this fraction of missing values are removed.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; ``data`` itself is left unmodified.

    Raises
    ------
    KeyError
        If ``data`` has no 'Location' column.
    """
    n_rows = len(data)
    cleaned = data.drop(['Location'], axis=1)
    if n_rows == 0:
        # Nothing to measure or impute; also avoids dividing by zero below.
        return cleaned

    # Rounding to two decimals before comparing keeps the same column
    # selection as the previous per-column implementation.
    missing_frac = (cleaned.isna().sum() / n_rows).round(2)
    too_sparse = missing_frac.index[missing_frac >= max_missing_frac]
    cleaned = cleaned.drop(columns=too_sparse)

    for col in cleaned.columns:
        if pd.api.types.is_numeric_dtype(cleaned[col]):
            # Assign the result back instead of calling fillna(inplace=True)
            # on a column selection: the in-place form acts on a temporary
            # object and does nothing under pandas Copy-on-Write.
            cleaned[col] = cleaned[col].fillna(cleaned[col].mean())

    return cleaned.dropna()

In [7]:
## Train and evaluate a logistic-regression rain forecast for each location

In [8]:
def get_RFC(location, data=None, holdout_year=2017):
    """Fit a logistic-regression rain forecast for one location.

    The location's rows are cleaned with ``clean_data``, the 'Yes'/'No' rain
    flags are encoded as 1/0 and the three wind-direction columns are one-hot
    encoded. Rows outside ``holdout_year`` are split 80/20 (random_state=42)
    to train the model and measure test accuracy; rows from ``holdout_year``
    are then predicted to measure forecast accuracy. Both accuracies are
    printed.

    Parameters
    ----------
    location : str
        Value of the 'Location' column to model.
    data : pandas.DataFrame, optional
        Observations to use; defaults to the notebook-level ``data_weather``.
    holdout_year : int, default 2017
        Calendar year kept out of training and used as the forecast period.

    Returns
    -------
    pandas.DataFrame
        The hold-out rows (a copy) with three extra columns: 'Forecast'
        (predicted 0/1), 'FC_accuracy' (accuracy on the hold-out year) and
        'Test_accuracy' (accuracy on the 20% test split), both rounded to
        two decimals.

    Raises
    ------
    ValueError
        If the location has no rows, if a wind-direction column is missing
        after cleaning, or if either the training period or the hold-out
        year is empty. Errors raised while cleaning or fitting are not
        swallowed; they propagate to the caller.
    """
    source = data_weather if data is None else data
    station = source[source['Location'] == location]
    if station.empty:
        raise ValueError(f'No observations found for location {location!r}')

    df = clean_data(station)

    # Encode the rain flags explicitly as integers so the target always has a
    # numeric dtype, independent of how the installed pandas version
    # downcasts the result of a frame-wide replace().
    for flag_col in ('RainToday', 'RainTomorrow'):
        if flag_col in df.columns:
            df[flag_col] = df[flag_col].map({'No': 0, 'Yes': 1}).astype(int)

    wind_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
    absent = [col for col in wind_cols if col not in df.columns]
    if absent:
        raise ValueError(
            f'{location}: column(s) {absent} are not present after cleaning, '
            'so the wind directions cannot be encoded'
        )
    # Encode before splitting so both periods share the same dummy columns.
    df = pd.get_dummies(df, columns=wind_cols, drop_first=True)

    is_holdout = df.Date.dt.year == holdout_year
    in_sample = df[~is_holdout]
    out_of_sample = df[is_holdout]
    if in_sample.empty or out_of_sample.empty:
        raise ValueError(
            f'{location}: need rows both inside and outside {holdout_year} '
            f'(got {len(out_of_sample)} and {len(in_sample)})'
        )

    X = in_sample.drop(['RainTomorrow', 'Date'], axis=1)
    y = in_sample['RainTomorrow']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # L2-regularised logistic regression.
    clf = LogisticRegression(penalty='l2')
    clf.fit(X_train, y_train)

    test_accuracy = round(metrics.accuracy_score(y_test, clf.predict(X_test)), 2)

    X_out = out_of_sample.drop(['RainTomorrow', 'Date'], axis=1)
    y_out = out_of_sample['RainTomorrow']
    y_fc = clf.predict(X_out)
    forecast_accuracy = round(metrics.accuracy_score(y_out, y_fc), 2)

    print(f'Test Accuracy for {location} is: {test_accuracy}')
    print(f'Forecast Accuracy for {location} is: {forecast_accuracy}')

    # Work on a copy so the new columns are not written into a slice of `df`.
    df_final = out_of_sample.copy()
    df_final['Forecast'] = y_fc
    df_final['FC_accuracy'] = forecast_accuracy
    df_final['Test_accuracy'] = test_accuracy
    return df_final

In [9]:
# Run the per-location model for every geocoded location and stack the
# hold-out forecasts into one table.
RESULT_COLUMNS = ['Date', 'Location', 'Latitude', 'Longitude', 'RainTomorrow', 'Forecast', 'FC_accuracy', 'Test_accuracy']

collect = []
skipped = {}  # location -> short description of why it produced no forecast
for loc in tqdm(locs_df['Location']):
    try:
        # Take the single matching row explicitly; calling float() on a
        # one-element array is deprecated in NumPy.
        coords = locs_df.loc[locs_df['Location'] == loc, ['latitude', 'longitude']].iloc[0]
        df_with_results = get_RFC(loc).assign(
            Location=str(loc),
            Latitude=float(coords['latitude']),
            Longitude=float(coords['longitude']),
            Date=lambda frame: frame['Date'].dt.date,
        )[RESULT_COLUMNS]
        collect.append(df_with_results)
    except Exception as err:
        # Best effort: one failing location must not stop the others, but the
        # reason is recorded. Catching Exception (not a bare except) also lets
        # KeyboardInterrupt stop the loop.
        skipped[loc] = f'{type(err).__name__}: {err}'

if skipped:
    print(f'Skipped {len(skipped)} of {len(locs_df)} locations:')
    for name, reason in skipped.items():
        print(f'  {name}: {reason}')

if not collect:
    raise ValueError('No location produced a forecast; see the skip reasons above.')

# ignore_index=True already yields a fresh 0..n-1 index.
df_final_result = pd.concat(collect, axis=0, ignore_index=True)


  8%|███▌                                        | 4/49 [00:00<00:02, 15.58it/s]

Test Accuracy for Adelaide is: 0.89
Forecast Accuracy for Adelaide is: 0.89
Test Accuracy for AliceSprings is: 0.95
Forecast Accuracy for AliceSprings is: 0.94


 16%|███████▏                                    | 8/49 [00:00<00:02, 14.12it/s]

Test Accuracy for Ballarat is: 0.85
Forecast Accuracy for Ballarat is: 0.86
Test Accuracy for Bendigo is: 0.88
Forecast Accuracy for Bendigo is: 0.9
Test Accuracy for Brisbane is: 0.87
Forecast Accuracy for Brisbane is: 0.87


 24%|██████████▌                                | 12/49 [00:00<00:02, 15.89it/s]

Test Accuracy for Cairns is: 0.81
Forecast Accuracy for Cairns is: 0.75
Test Accuracy for Cobar is: 0.92
Forecast Accuracy for Cobar is: 0.95


 33%|██████████████                             | 16/49 [00:01<00:02, 14.52it/s]

Test Accuracy for Darwin is: 0.87
Forecast Accuracy for Darwin is: 0.87
Test Accuracy for GoldCoast is: 0.82
Forecast Accuracy for GoldCoast is: 0.71
Test Accuracy for Hobart is: 0.81
Forecast Accuracy for Hobart is: 0.88


 37%|███████████████▊                           | 18/49 [00:01<00:02, 15.47it/s]

Test Accuracy for Katherine is: 0.9
Forecast Accuracy for Katherine is: 0.89
Test Accuracy for Melbourne is: 0.83
Forecast Accuracy for Melbourne is: 0.83


 45%|███████████████████▎                       | 22/49 [00:01<00:02, 12.76it/s]

Test Accuracy for MelbourneAirport is: 0.85
Forecast Accuracy for MelbourneAirport is: 0.87
Test Accuracy for Mildura is: 0.92
Forecast Accuracy for Mildura is: 0.94
Test Accuracy for Moree is: 0.92
Forecast Accuracy for Moree is: 0.91


 53%|██████████████████████▊                    | 26/49 [00:01<00:01, 13.86it/s]

Test Accuracy for MountGambier is: 0.83
Forecast Accuracy for MountGambier is: 0.84
Test Accuracy for MountGinini is: 0.82
Forecast Accuracy for MountGinini is: 0.87
Test Accuracy for Nhil is: 0.89
Forecast Accuracy for Nhil is: 0.94


 57%|████████████████████████▌                  | 28/49 [00:02<00:01, 12.88it/s]

Test Accuracy for NorahHead is: 0.82
Forecast Accuracy for NorahHead is: 0.77
Test Accuracy for NorfolkIsland is: 0.79
Forecast Accuracy for NorfolkIsland is: 0.79
Test Accuracy for Nuriootpa is: 0.86
Forecast Accuracy for Nuriootpa is: 0.88


 65%|████████████████████████████               | 32/49 [00:02<00:01, 13.82it/s]

Test Accuracy for PearceRAAF is: 0.91
Forecast Accuracy for PearceRAAF is: 0.91
Test Accuracy for Perth is: 0.88
Forecast Accuracy for Perth is: 0.89
Test Accuracy for PerthAirport is: 0.89
Forecast Accuracy for PerthAirport is: 0.9


 73%|███████████████████████████████▌           | 36/49 [00:02<00:00, 13.44it/s]

Test Accuracy for Portland is: 0.78
Forecast Accuracy for Portland is: 0.78
Test Accuracy for Sale is: 0.84
Forecast Accuracy for Sale is: 0.89


 78%|█████████████████████████████████▎         | 38/49 [00:02<00:00, 13.71it/s]

Test Accuracy for SalmonGums is: 0.87
Forecast Accuracy for SalmonGums is: 0.88
Test Accuracy for SydneyAirport is: 0.8
Forecast Accuracy for SydneyAirport is: 0.82


 86%|████████████████████████████████████▊      | 42/49 [00:03<00:00, 13.84it/s]

Test Accuracy for Townsville is: 0.9
Forecast Accuracy for Townsville is: 0.81
Test Accuracy for Uluru is: 0.93
Forecast Accuracy for Uluru is: 0.92
Test Accuracy for WaggaWagga is: 0.88
Forecast Accuracy for WaggaWagga is: 0.92


 94%|████████████████████████████████████████▎  | 46/49 [00:03<00:00, 12.34it/s]

Test Accuracy for Walpole is: 0.79
Forecast Accuracy for Walpole is: 0.81
Test Accuracy for Watsonia is: 0.84
Forecast Accuracy for Watsonia is: 0.82
Test Accuracy for Williamtown is: 0.82
Forecast Accuracy for Williamtown is: 0.74


100%|███████████████████████████████████████████| 49/49 [00:03<00:00, 13.24it/s]

Test Accuracy for Witchcliffe is: 0.87
Forecast Accuracy for Witchcliffe is: 0.86
Test Accuracy for Wollongong is: 0.86
Forecast Accuracy for Wollongong is: 0.79
Test Accuracy for Woomera is: 0.93
Forecast Accuracy for Woomera is: 0.98





In [10]:
# Show the first 500 rows of the combined forecast table.
df_final_result.head(500)

Unnamed: 0,Date,Location,Latitude,Longitude,RainTomorrow,Forecast,FC_accuracy,Test_accuracy
0,2017-01-01,Adelaide,-34.929075,138.602578,0,0,0.89,0.89
1,2017-01-02,Adelaide,-34.929075,138.602578,0,0,0.89,0.89
2,2017-01-03,Adelaide,-34.929075,138.602578,0,0,0.89,0.89
3,2017-01-04,Adelaide,-34.929075,138.602578,0,0,0.89,0.89
4,2017-01-05,Adelaide,-34.929075,138.602578,0,0,0.89,0.89
...,...,...,...,...,...,...,...,...
495,2017-02-14,Bendigo,-36.757608,144.279570,0,0,0.90,0.88
496,2017-02-15,Bendigo,-36.757608,144.279570,0,0,0.90,0.88
497,2017-02-16,Bendigo,-36.757608,144.279570,0,0,0.90,0.88
498,2017-02-17,Bendigo,-36.757608,144.279570,0,0,0.90,0.88


In [11]:
# Total of the 0/1 'Forecast' column, i.e. how many rows were predicted as rain.
df_final_result.Forecast.sum()

874

In [12]:
# (rows, columns) of the combined forecast table.
df_final_result.shape

(5938, 8)

In [13]:
# Write the combined forecasts to 'LR_forecast.csv' without the index column.
df_final_result.to_csv('LR_forecast.csv',index=False)