# RFC vs LR in rainfall prediction

In [14]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score
from sklearn import metrics
import folium
import time
from bs4 import BeautifulSoup
import requests
import json
import random
warnings.filterwarnings('ignore')

### Data
https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package

In [15]:
data_weather = pd.read_csv('weatherAUS.csv')

In [16]:
data_weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [17]:
data_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [18]:
data_weather['Date'] = pd.to_datetime(data_weather['Date'], format='%Y-%m-%d')
data_weather = data_weather[data_weather['RainToday'].notna()]
data_weather = data_weather[data_weather['RainTomorrow'].notna()]

### Scrapping latitude&longitude of locations

In [19]:
locs = data_weather.Location.unique()
locs.sort()
api = "http://api.positionstack.com/v1/forward"
token = 'f898acba181f26412807e4fc0bf8c2ea'
locs_df = pd.DataFrame(locs, columns=['Location'])
locs_df['latitude'] = ''
locs_df['longitude'] = ''
for ind, row in locs_df.iterrows():
    try:
        query = row['Location'] + ', Australia'
        url = f'{api}?access_key={token}&query={query}&limit=1'
        page = requests.get(url)
        doc = BeautifulSoup(page.content, 'html.parser')
        json_dict = json.loads(str(doc))
        locs_df.loc[int(ind), 'latitude'] = str(json_dict['data'][0]['latitude'])
        locs_df.loc[int(ind), 'longitude'] = str(json_dict['data'][0]['longitude'])
    except:
        locs_df.loc[int(ind), 'latitude'] = '-33.865143'
        locs_df.loc[int(ind), 'longitude'] = '151.209900'


In [20]:
locs_df.head()

Unnamed: 0,Location,latitude,longitude
0,Adelaide,-34.929075,138.602578
1,Albany,-35.031075,117.89494
2,Albury,-36.074771,146.914504
3,AliceSprings,-33.865143,151.2099
4,BadgerysCreek,-33.865143,151.2099


# Functions

In [21]:
# Converts categorical to num factors
def str_to_num(data, col):
    mapdict = {}
    values = list(data[col].unique())
    for i in range(len(values)):
        mapdict[values[i]] = i
    return mapdict

# Cleans data a bit
def clean_data(data):
    n = len(data)
    data = data.drop(['Location'], axis = 1)
    for col in data.columns:
        b = round(data[col].isna().sum()/n,2)
        if (b>= 0.1):
            data = data.drop([col], axis = 1)
        else:
            try:
                data[col].fillna((data[col].mean()), inplace=True)
            except:
                pass
            if (data[col].dtype == 'O'):
                data[col].replace(str_to_num(data, col),inplace = True)
    data = data.dropna()
    return data

# Produces RFC classified output
def get_RFC(locaction):
    try:
        data = data_weather[data_weather['Location'] == locaction]
        df = clean_data(data)
        in_sample = df[df.Date.dt.year != 2017]
        out_of_sample = df[df.Date.dt.year == 2017]

        X=in_sample.drop(['RainTomorrow', 'Date'], axis = 1)
        y=in_sample['RainTomorrow']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        clf = RandomForestClassifier(n_estimators=100)
        clf.fit(X_train,y_train)

        y_pred = clf.predict(X_test)
        test_error = round(metrics.accuracy_score(y_test, y_pred),2)

        X_out = out_of_sample.drop(['RainTomorrow', 'Date'], axis = 1)
        y_out = out_of_sample['RainTomorrow']

        y_fc = clf.predict(X_out)
        forecast_error = round(metrics.accuracy_score(y_out, y_fc),2)
        print(f'Test Accuracy for {locaction} is: {test_error}')
        print(f'Forecast Accuracy for {locaction} is: {forecast_error}')
        df_final = out_of_sample
        df_final['Forecast'] = y_fc
        df_final['FC_accuracy'] = forecast_error
        df_final['Test_accuracy'] = test_error
    except:
        pass

    return df_final
    
# Produces LR classified output
def get_LR(location):
    try:
        data = data_weather[data_weather['Location'] == location]
        df = clean_data(data)
        df.replace({'No':0, 'Yes':1}, inplace=True)
        df = pd.get_dummies(df, columns=['WindGustDir','WindDir9am','WindDir3pm'], drop_first=True)
        in_sample = df[df.Date.dt.year != 2017]
        out_of_sample = df[df.Date.dt.year == 2017]

        X=in_sample.drop(['RainTomorrow', 'Date'], axis = 1)
        y=in_sample['RainTomorrow']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        clf = LogisticRegression(penalty='l2') # logistic regression with no penalty term in the cost function.

        clf.fit(X_train, y_train)
        test_prob = clf.predict_proba(X_test)[:, 1]
        test_pred = clf.predict(X_test)
        test_error = round(metrics.accuracy_score(y_test, test_pred),2)


        X_out = out_of_sample.drop(['RainTomorrow', 'Date'], axis = 1)
        y_out = out_of_sample['RainTomorrow']

        y_fc = clf.predict(X_out)
        forecast_error = round(metrics.accuracy_score(y_out, y_fc),2)
        print(f'Test Accuracy for {location} is: {test_error}')
        print(f'Forecast Accuracy for {location} is: {forecast_error}')
        df_final = out_of_sample
        df_final['Forecast'] = y_fc
        df_final['FC_accuracy'] = forecast_error
        df_final['Test_accuracy'] = test_error
    except:
        pass
    
    return df_final

### Run Random Forest Classifier

In [22]:
collect = []
for loc in locs_df['Location']:
    try:     
        df_with_results = get_RFC(loc)
        df_with_results['Location'] = str(loc)
        df_with_results['Latitude'] = float(locs_df[locs_df['Location'] == loc]['latitude'].values)
        df_with_results['Longitude'] = float(locs_df[locs_df['Location'] == loc]['longitude'].values)
        df_with_results['Date'] = df_with_results.Date.dt.date
        df_with_results = df_with_results[['Date', 'Location', 'Latitude', 'Longitude', 'RainTomorrow', 'Forecast', 'FC_accuracy', 'Test_accuracy']]
        collect.append(df_with_results)
    except:
        pass

df_final_result_RFC = pd.concat(collect, axis=0, ignore_index=True).reset_index(drop = True)



Test Accuracy for Adelaide is: 0.86
Forecast Accuracy for Adelaide is: 0.91
Test Accuracy for Albany is: 0.78
Forecast Accuracy for Albany is: 0.82
Test Accuracy for Albury is: 0.87
Forecast Accuracy for Albury is: 0.92
Test Accuracy for AliceSprings is: 0.94
Forecast Accuracy for AliceSprings is: 0.96
Test Accuracy for BadgerysCreek is: 0.85
Forecast Accuracy for BadgerysCreek is: 0.83
Test Accuracy for Ballarat is: 0.82
Forecast Accuracy for Ballarat is: 0.86
Test Accuracy for Bendigo is: 0.87
Forecast Accuracy for Bendigo is: 0.92
Test Accuracy for Brisbane is: 0.86
Forecast Accuracy for Brisbane is: 0.86
Test Accuracy for Cairns is: 0.83
Forecast Accuracy for Cairns is: 0.76
Test Accuracy for Canberra is: 0.85
Forecast Accuracy for Canberra is: 0.88
Test Accuracy for Cobar is: 0.89
Forecast Accuracy for Cobar is: 0.91
Test Accuracy for CoffsHarbour is: 0.81
Forecast Accuracy for CoffsHarbour is: 0.74
Test Accuracy for Dartmoor is: 0.83
Forecast Accuracy for Dartmoor is: 0.85
Test A

#### Check results

In [23]:
df_final_result_RFC.head()

Unnamed: 0,Date,Location,Latitude,Longitude,RainTomorrow,Forecast,FC_accuracy,Test_accuracy
0,2017-01-01,Adelaide,-34.929075,138.602578,0,0,0.91,0.86
1,2017-01-02,Adelaide,-34.929075,138.602578,0,0,0.91,0.86
2,2017-01-03,Adelaide,-34.929075,138.602578,0,0,0.91,0.86
3,2017-01-04,Adelaide,-34.929075,138.602578,0,0,0.91,0.86
4,2017-01-05,Adelaide,-34.929075,138.602578,0,0,0.91,0.86


### Run Logistic Regression

In [32]:
collect = []
for loc in locs_df['Location']:
    try:     
        df_with_results_LR = get_LR(loc)
        df_with_results_LR['Location'] = str(loc)
        df_with_results_LR['Latitude'] = float(locs_df[locs_df['Location'] == loc]['latitude'].values)
        df_with_results_LR['Longitude'] = float(locs_df[locs_df['Location'] == loc]['longitude'].values)
        df_with_results_LR['Date'] = df_with_results_LR.Date.dt.date
        df_with_results_LR = df_with_results_LR[['Date', 'Location', 'Latitude', 'Longitude', 'RainTomorrow', 'Forecast', 'FC_accuracy', 'Test_accuracy']]
        collect.append(df_with_results_LR)
    except:
        pass

df_final_result_LR = pd.concat(collect, axis=0, ignore_index=True).reset_index(drop = True)

Test Accuracy for Adelaide is: 0.85
Forecast Accuracy for Adelaide is: 0.89
Test Accuracy for AliceSprings is: 0.95
Forecast Accuracy for AliceSprings is: 0.94
Test Accuracy for Ballarat is: 0.85
Forecast Accuracy for Ballarat is: 0.87
Test Accuracy for Bendigo is: 0.86
Forecast Accuracy for Bendigo is: 0.91
Test Accuracy for Brisbane is: 0.85
Forecast Accuracy for Brisbane is: 0.84
Test Accuracy for Cairns is: 0.81
Forecast Accuracy for Cairns is: 0.74
Test Accuracy for Cobar is: 0.93
Forecast Accuracy for Cobar is: 0.95
Test Accuracy for Darwin is: 0.86
Forecast Accuracy for Darwin is: 0.87
Test Accuracy for GoldCoast is: 0.86
Forecast Accuracy for GoldCoast is: 0.71
Test Accuracy for Hobart is: 0.8
Forecast Accuracy for Hobart is: 0.87
Test Accuracy for Katherine is: 0.89
Forecast Accuracy for Katherine is: 0.88
Test Accuracy for Melbourne is: 0.85
Forecast Accuracy for Melbourne is: 0.85
Test Accuracy for MelbourneAirport is: 0.81
Forecast Accuracy for MelbourneAirport is: 0.88
Tes

#### Check results

In [33]:
df_final_result_LR

Unnamed: 0,Date,Location,Latitude,Longitude,RainTomorrow,Forecast,FC_accuracy,Test_accuracy
0,2017-01-01,Adelaide,-34.929075,138.602578,0,0,0.89,0.85
1,2017-01-02,Adelaide,-34.929075,138.602578,0,0,0.89,0.85
2,2017-01-03,Adelaide,-34.929075,138.602578,0,0,0.89,0.85
3,2017-01-04,Adelaide,-34.929075,138.602578,0,0,0.89,0.85
4,2017-01-05,Adelaide,-34.929075,138.602578,0,0,0.89,0.85
...,...,...,...,...,...,...,...,...
6318,2017-06-20,Woomera,-31.149586,136.800011,0,0,0.98,0.96
6319,2017-06-21,Woomera,-31.149586,136.800011,0,0,0.98,0.96
6320,2017-06-22,Woomera,-31.149586,136.800011,0,0,0.98,0.96
6321,2017-06-23,Woomera,-31.149586,136.800011,0,0,0.98,0.96


## Merge Results to get a dataset and put it into Tableau

In [34]:
df_fin = df_final_result_RFC.merge(df_final_result_LR, how='left',
                                   on= ['Date', 'Location', 'Latitude', 'Longitude', 'RainTomorrow'],
                                   suffixes=('_RFC', '_LR'))

In [35]:
df_fin.head()

Unnamed: 0,Date,Location,Latitude,Longitude,RainTomorrow,Forecast_RFC,FC_accuracy_RFC,Test_accuracy_RFC,Forecast_LR,FC_accuracy_LR,Test_accuracy_LR
0,2017-01-01,Adelaide,-34.929075,138.602578,0,0,0.91,0.86,0.0,0.89,0.85
1,2017-01-02,Adelaide,-34.929075,138.602578,0,0,0.91,0.86,0.0,0.89,0.85
2,2017-01-03,Adelaide,-34.929075,138.602578,0,0,0.91,0.86,0.0,0.89,0.85
3,2017-01-04,Adelaide,-34.929075,138.602578,0,0,0.91,0.86,0.0,0.89,0.85
4,2017-01-05,Adelaide,-34.929075,138.602578,0,0,0.91,0.86,0.0,0.89,0.85


In [36]:
df_fin.to_csv('Australia_forecast.csv', header = True, index=False)