# Australia Weather Forecast

The Dataset contains daily weather observations from numerous Australian weather stations.

- **Goal:** predict the weather next day 
- **Metric:** Accuracy

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("weather_austria.csv")

In [3]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [4]:
df.set_index("Date")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 24 columns):
Date             145460 non-null object
Location         145460 non-null object
MinTemp          143975 non-null float64
MaxTemp          144199 non-null float64
Rainfall         142199 non-null float64
Evaporation      82670 non-null float64
Sunshine         75625 non-null float64
WindGustDir      135134 non-null object
WindGustSpeed    135197 non-null float64
WindDir9am       134894 non-null object
WindDir3pm       141232 non-null object
WindSpeed9am     143693 non-null float64
WindSpeed3pm     142398 non-null float64
Humidity9am      142806 non-null float64
Humidity3pm      140953 non-null float64
Pressure9am      130395 non-null float64
Pressure3pm      130432 non-null float64
Cloud9am         89572 non-null float64
Cloud3pm         86102 non-null float64
Temp9am          143693 non-null float64
Temp3pm          141851 non-null float64
RainToday        142199 non-null obje

In [5]:
df.isnull().values.sum()

346515

In [6]:
df = df.dropna(how='any')

In [7]:
# transforming yes/no values into numerical values
df['RainToday'].replace({'No': 0, 'Yes': 1},inplace = True)
df['RainTomorrow'].replace({'No': 0, 'Yes': 1},inplace = True)

# create dummy variables for categorical features
categorical_columns = ['WindGustDir', 'WindDir3pm', 'WindDir9am']
for col in categorical_columns:
    print(np.unique(df[col]))
df = pd.get_dummies(df, columns=categorical_columns)

['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']
['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']
['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']


In [8]:
# selecting important features that are most valuable for our prediction
df = df[['Humidity3pm','Rainfall','RainToday','RainTomorrow', 'Sunshine']]
X = df[['Humidity3pm', "Rainfall", "RainToday", 'Sunshine']] # let's use only one feature Humidity3pm
y = df[['RainTomorrow']]


# Model Selection

#### Comapring different models:
- Decision Trees
- Random Forest
- Support Vector machine
- Adaboost
- Logistic Regression

In [9]:
# spliting the data into a training and test set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)


### Decision Tree Classifier


In [10]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(criterion='gini', random_state=0)

t0=time.time()
dt_model = dt_clf.fit(X_train, y_train)
print("Trainings Time:", time.time()-t0)

t0=time.time()
y_pred = dt_model.predict(X_test)
print("Prediction Time:", time.time()-t0)

dt_acc = accuracy_score(y_test,y_pred)
print("---------------------------------------")
print("Decision Tree accuracy:", dt_acc)

Trainings Time: 0.060053348541259766
Prediction Time: 0.003958702087402344
---------------------------------------
Decision Tree accuracy: 0.7932647997164126


### Random Forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(criterion='entropy', random_state=0, n_jobs=-1)

t0=time.time()
rf_model = rf_clf.fit(X_train, y_train)
print("Trainings Time:", time.time()-t0)

t0=time.time()
y_pred = rf_model.predict(X_test)
print("Prediction Time:", time.time()-t0)

rf_acc = accuracy_score(y_pred, y_test)
print("---------------------------------------")
print("Random Forest accuracy:", rf_acc)

  


Trainings Time: 0.2442641258239746
Prediction Time: 0.10908150672912598
---------------------------------------
Random Forest accuracy: 0.8045373980857852


### Support Vector Machine

In [12]:
from sklearn import svm

scaler = StandardScaler()
X_std = scaler.fit_transform(X)
svc_clf = svm.SVC(kernel='rbf', gamma='scale')

t0=time.time()
svc_model = svc_clf.fit(X_std, y)
print("Trainings Time:", time.time()-t0)

t0=time.time()
y_pred = svc_model.predict(X_test)
print("Prediction Time:", time.time()-t0)

svc_acc = accuracy_score(y_pred, y_test)
print("---------------------------------------")
print("Support Vector Machine accuracy:", svc_acc)


  y = column_or_1d(y, warn=True)


Trainings Time: 45.64207458496094
Prediction Time: 8.392092943191528
---------------------------------------
Support Vector Machine accuracy: 0.7822048918823112


### Adaboos Classifier

In [15]:
from sklearn.ensemble import AdaBoostClassifier

ab_clf = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1,
                         random_state=0)

t0=time.time()
ab_model = ab_clf.fit(X, y)
print("Trainings Time:", time.time()-t0)

t0=time.time()
y_pred = ab_model.predict(X_test)
print("Prediction Time:", time.time()-t0)

ab_acc = accuracy_score(y_pred, y_test)
print("---------------------------------------")
print("Adaboost accuracy:", ab_acc)

  y = column_or_1d(y, warn=True)


Trainings Time: 0.8977203369140625
Prediction Time: 0.06228351593017578
---------------------------------------
Adaboost accuracy: 0.8366536689117334


### Logistic Regression Classifier

In [16]:
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()
X_std = scaler.fit_transform(X)

lr_clf = LogisticRegression(random_state=0)

t0=time.time()
lr_model = lr_clf.fit(X_std, y)
print("Trainings Time:", time.time()-t0)

t0=time.time()
y_pred = lr_model.predict(X_test)
print("Prediction Time:", time.time()-t0)

lr_acc = accuracy_score(y_pred, y_test)
print("---------------------------------------")
print("Logistic Regression accuracy:", lr_acc)


Trainings Time: 0.09202790260314941
Prediction Time: 0.0040395259857177734
---------------------------------------
Logistic Regression accuracy: 0.2370081531371854


  y = column_or_1d(y, warn=True)


### Conclusion

As we can see, adaboost is outperforming all other models by a significant amount when it comes to the training the model and predictions.

Im not enitrely sure why the logisitic regression model is underperforming, so it will need further investigations. For now I would pick the adaboost classifier folowed by decision tree classifier. 