In [1]:
librarian::shelf(tidyverse, tidymodels, kableExtra, patchwork, 
                skimr, gridExtra, janitor, corrplot, scales,
                GGally, car, forcats, performance, glmmTMB, splines, mgcv, DHARMa)

In [2]:
# Load the raw data set from the local CSV file (column types are guessed).
df <- readr::read_csv(file = "data/weatherAUS.csv")

[1mRows: [22m[34m145460[39m [1mColumns: [22m[34m23[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m   (6): Location, WindGustDir, WindDir9am, WindDir3pm, RainToday, RainTom...
[32mdbl[39m  (16): MinTemp, MaxTemp, Rainfall, Evaporation, Sunshine, WindGustSpeed,...
[34mdate[39m  (1): Date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
# Preview the first rows of the raw data as a formatted table.
kable(head(df))



|Date       |Location | MinTemp| MaxTemp| Rainfall| Evaporation| Sunshine|WindGustDir | WindGustSpeed|WindDir9am |WindDir3pm | WindSpeed9am| WindSpeed3pm| Humidity9am| Humidity3pm| Pressure9am| Pressure3pm| Cloud9am| Cloud3pm| Temp9am| Temp3pm|RainToday |RainTomorrow |
|:----------|:--------|-------:|-------:|--------:|-----------:|--------:|:-----------|-------------:|:----------|:----------|------------:|------------:|-----------:|-----------:|-----------:|-----------:|--------:|--------:|-------:|-------:|:---------|:------------|
|2008-12-01 |Albury   |    13.4|    22.9|      0.6|          NA|       NA|W           |            44|W          |WNW        |           20|           24|          71|          22|      1007.7|      1007.1|        8|       NA|    16.9|    21.8|No        |No           |
|2008-12-02 |Albury   |     7.4|    25.1|      0.0|          NA|       NA|WNW         |            44|NNW        |WSW        |            4|           22|          44|          25|      10

In [4]:
# Preview the last rows of the raw data as a formatted table.
kable(tail(df))



|Date       |Location | MinTemp| MaxTemp| Rainfall| Evaporation| Sunshine|WindGustDir | WindGustSpeed|WindDir9am |WindDir3pm | WindSpeed9am| WindSpeed3pm| Humidity9am| Humidity3pm| Pressure9am| Pressure3pm| Cloud9am| Cloud3pm| Temp9am| Temp3pm|RainToday |RainTomorrow |
|:----------|:--------|-------:|-------:|--------:|-----------:|--------:|:-----------|-------------:|:----------|:----------|------------:|------------:|-----------:|-----------:|-----------:|-----------:|--------:|--------:|-------:|-------:|:---------|:------------|
|2017-06-20 |Uluru    |     3.5|    21.8|        0|          NA|       NA|E           |            31|ESE        |E          |           15|           13|          59|          27|      1024.7|      1021.2|       NA|       NA|     9.4|    20.9|No        |No           |
|2017-06-21 |Uluru    |     2.8|    23.4|        0|          NA|       NA|E           |            31|SE         |ENE        |           13|           11|          51|          24|      10

In [5]:
# Build the analysis data set:
#   1. convert column names to snake_case,
#   2. derive calendar features (month and weekday) from the date,
#   3. keep only rows where the response `rainfall` was observed.
# NOTE(review): wday(label = TRUE) yields an ordered factor and as.factor()
# leaves existing factors untouched, so `day` presumably stays ordered --
# confirm that is intended before using it as a model term.
df_clean <- df %>%
  clean_names() %>%
  mutate(date = as.Date(date)) %>%
  mutate(
    month = as.factor(month(date)),
    day = as.factor(wday(date, label = TRUE))
  ) %>%
  drop_na(rainfall)

In [6]:
# Look for fully duplicated rows.
# get_dupes() is called without column names, so every column is compared.

duplicates <- get_dupes(df_clean)

print(paste("Number of duplicate rows: ", nrow(duplicates)))

No variable names specified - using all columns.

No duplicate combinations found of: date, location, min_temp, max_temp, rainfall, evaporation, sunshine, wind_gust_dir, wind_gust_speed, ... and 16 other variables
[1] "Number of duplicate rows:  0"


In [7]:
# Weekday distribution of rainy days (rainfall > 0): count and percentage
# share per weekday, sorted from the most to the fewest rainy days.

day_tab <- tabyl(filter(df_clean, rainfall > 0), day) %>%
  adorn_pct_formatting() %>%
  arrange(desc(n))

print(day_tab)

 day    n percent
 Tue 7508   14.7%
 Mon 7480   14.6%
 Fri 7378   14.4%
 Wed 7342   14.4%
 Thu 7314   14.3%
 Sat 7057   13.8%
 Sun 7040   13.8%


In [8]:
# Monthly distribution of rainy days (rainfall > 0): count and percentage
# share per month, sorted from the most to the fewest rainy days.

month_tab <- tabyl(filter(df_clean, rainfall > 0), month) %>%
  adorn_pct_formatting() %>%
  arrange(desc(n))

print(month_tab)

 month    n percent
     6 5448   10.7%
     7 5250   10.3%
     5 4937    9.7%
     8 4704    9.2%
     3 4444    8.7%
     9 4234    8.3%
     4 4001    7.8%
    10 3770    7.4%
    11 3760    7.4%
     1 3702    7.2%
    12 3562    7.0%
     2 3307    6.5%


In [9]:
# Counts of rainy days (rainfall > 0) by month (rows) and weekday (columns),
# with totals appended on both margins.
cat("\nCross-tabulation: Month vs Day:\n")
cross_tab <- tabyl(filter(df_clean, rainfall > 0), month, day) %>%
  adorn_totals(where = c("row", "col"))
print(cross_tab)


Cross-tabulation: Month vs Day:
 month  Sun  Mon  Tue  Wed  Thu  Fri  Sat Total
     1  536  570  514  482  493  567  540  3702
     2  480  530  469  466  443  471  448  3307
     3  636  645  639  592  662  644  626  4444
     4  578  597  566  604  579  515  562  4001
     5  710  722  765  714  678  681  667  4937
     6  766  801  804  797  760  753  767  5448
     7  694  717  728  796  773  818  724  5250
     8  612  702  694  679  689  687  641  4704
     9  516  571  616  639  648  659  585  4234
    10  507  606  579  550  517  456  555  3770
    11  556  526  554  480  538  575  531  3760
    12  449  493  580  543  534  552  411  3562
 Total 7040 7480 7508 7342 7314 7378 7057 51119


In [10]:
# Total number of NA cells in the raw data (`df`, i.e. before any cleaning).
print(paste("Total Missing values: ", df %>% is.na() %>% sum()))

[1] "Total Missing values:  343248"


In [12]:
# Percentage of missing values in each column of the cleaned data,
# listed from the most to the least incomplete column.
missing_tab <- df_clean %>%
  summarise(across(everything(), function(col) mean(is.na(col)) * 100)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "column",
    values_to = "pct_missing"
  ) %>%
  arrange(desc(pct_missing))

kable(missing_tab)



|column          | pct_missing|
|:---------------|-----------:|
|sunshine        |  47.6937250|
|evaporation     |  42.5375706|
|cloud3pm        |  39.9960619|
|cloud9am        |  37.5044832|
|pressure3pm     |   9.8404349|
|pressure9am     |   9.8031632|
|wind_dir9am     |   6.8840147|
|wind_gust_dir   |   6.8390073|
|wind_gust_speed |   6.7968129|
|wind_dir3pm     |   2.6716081|
|humidity3pm     |   2.5527606|
|temp3pm         |   1.9310966|
|wind_speed3pm   |   1.8614758|
|humidity9am     |   1.0928347|
|rain_tomorrow   |   0.9929746|
|wind_speed9am   |   0.7672347|
|temp9am         |   0.4817193|
|min_temp        |   0.3424778|
|max_temp        |   0.3305227|
|date            |   0.0000000|
|location        |   0.0000000|
|rainfall        |   0.0000000|
|rain_today      |   0.0000000|
|month           |   0.0000000|
|day             |   0.0000000|

In [13]:
# Descriptive statistics for the response variable `rainfall`:
# size, location, spread, share of exact zeros, share of values above 100,
# and shape (skewness, kurtosis).
# skewness/kurtosis are called as moments::..., so the `moments` package must
# be installed; it is not part of the librarian::shelf() call at the top.

rainfall_stats <- summarise(
  df_clean,
  n = n(),
  mean = mean(rainfall),
  median = median(rainfall),
  sd = sd(rainfall),
  min = min(rainfall),
  max = max(rainfall),
  q25 = quantile(rainfall, probs = 0.25),
  q75 = quantile(rainfall, probs = 0.75),
  iqr = IQR(rainfall),
  n_zeros = sum(rainfall == 0),
  pct_zeros = mean(rainfall == 0) * 100,
  n_large = sum(rainfall > 100),
  pct_large = mean(rainfall > 100) * 100,
  skewness = moments::skewness(rainfall),
  kurtosis = moments::kurtosis(rainfall)
)

# Transpose so that each statistic is displayed on its own row.
kable(t(rainfall_stats))



|          |             |
|:---------|------------:|
|n         | 1.421990e+05|
|mean      | 2.360918e+00|
|median    | 0.000000e+00|
|sd        | 8.478060e+00|
|min       | 0.000000e+00|
|max       | 3.710000e+02|
|q25       | 0.000000e+00|
|q75       | 8.000000e-01|
|iqr       | 8.000000e-01|
|n_zeros   | 9.108000e+04|
|pct_zeros | 6.405108e+01|
|n_large   | 1.510000e+02|
|pct_large | 1.061892e-01|
|skewness  | 9.836122e+00|
|kurtosis  | 1.811458e+02|

In [18]:

# Split of dry (rainfall == 0) versus rainy (rainfall > 0) days and the
# percentage of exact zeros in the response.
rain_check <- summarise(
  df_clean,
  total_days = n(),
  dry_days = sum(rainfall == 0),
  rainy_days = sum(rainfall > 0),
  zero_inflation_pct = dry_days / total_days * 100
)

kable(rain_check)



| total_days| dry_days| rainy_days| zero_inflation_pct|
|----------:|--------:|----------:|------------------:|
|     142199|    91080|      51119|           64.05108|

# Trim the columns


In [19]:

# Fit a linear model of rainfall on every other column of df_clean and pass it
# to check_collinearity() to obtain variance inflation factors per term.
# NOTE(review): lm() follows the session's na.action setting; with the usual
# default, rows containing any NA are dropped, so this fit presumably uses far
# fewer rows than df_clean (several columns are ~40% missing) -- confirm.
vif_check <- lm(rainfall ~ ., data = df_clean)
check_collinearity(vif_check)

[34m# Check for Multicollinearity
[39m
[32mLow Correlation

[39m            Term  VIF     VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
            date 1.13 [ 1.12,  1.14]     1.06      0.88     [0.87, 0.89]
     evaporation 2.72 [ 2.69,  2.76]     1.65      0.37     [0.36, 0.37]
        sunshine 3.91 [ 3.85,  3.96]     1.98      0.26     [0.25, 0.26]
 wind_gust_speed 3.25 [ 3.20,  3.29]     1.80      0.31     [0.30, 0.31]
   wind_speed9am 2.14 [ 2.11,  2.16]     1.46      0.47     [0.46, 0.47]
   wind_speed3pm 2.47 [ 2.44,  2.50]     1.57      0.41     [0.40, 0.41]
     humidity9am 4.92 [ 4.85,  5.00]     2.22      0.20     [0.20, 0.21]
        cloud9am 2.47 [ 2.44,  2.50]     1.57      0.40     [0.40, 0.41]
        cloud3pm 2.38 [ 2.35,  2.41]     1.54      0.42     [0.41, 0.43]
      rain_today 1.45 [ 1.44,  1.47]     1.21      0.69     [0.68, 0.70]
   rain_tomorrow 1.60 [ 1.59,  1.62]     1.27      0.62     [0.62, 0.63]
           month 4.90 [ 4.83,  4.98]     1.07      0.20 

In [None]:
# Trim df_clean down to the columns kept for modelling.
# The cell overwrites df_clean in place, so a plain negative select() fails on
# a second run because the columns are already gone. any_of() removes whichever
# of the listed columns are still present and ignores the rest, which keeps the
# cell safe to re-run.
cols_to_drop <- c(
  "min_temp", "temp9am", "temp3pm", "pressure9am", "wind_dir3pm",
  "wind_dir9am", "location", "date", "rain_tomorrow",
  "sunshine", "evaporation", "cloud3pm", "cloud9am"
)

df_clean <- df_clean %>%
  select(-any_of(cols_to_drop))

In [22]:

# Zero-inflated Gamma model for daily rainfall fitted with glmmTMB:
#   - conditional part (log link): rainfall amount as a function of max_temp,
#     wind_gust_speed, humidity3pm, pressure3pm, rain_today and month;
#   - zero-inflation part: humidity3pm, pressure3pm and month.
# NOTE(review): rain_today looks like it may be derived from the same day's
# rainfall (the response) -- confirm it is not leaking the target into the
# conditional model.
# NOTE(review): the reported residual df is well below nrow(df_clean), so rows
# with NA in any model variable are presumably dropped during fitting -- confirm.
m1_zigamma <- glmmTMB(
  rainfall ~ max_temp + wind_gust_speed + humidity3pm + pressure3pm + rain_today + month,
  ziformula = ~ humidity3pm + pressure3pm + month, 
  data = df_clean,
  family = ziGamma(link = "log")
)

# Print coefficient tables for both model components plus fit statistics.
summary(m1_zigamma)

 Family: Gamma  ( log )
Formula:          
rainfall ~ max_temp + wind_gust_speed + humidity3pm + pressure3pm +  
    rain_today + month
Zero inflation:            ~humidity3pm + pressure3pm + month
Data: df_clean

      AIC       BIC    logLik -2*log(L)  df.resid 
 308357.1  308667.5 -154146.5  308293.1    120851 


Dispersion estimate for Gamma family (sigma^2): 0.707 

Conditional model:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)      8.8120567  0.6621234    13.3  < 2e-16 ***
max_temp         0.0192311  0.0010079    19.1  < 2e-16 ***
wind_gust_speed  0.0084532  0.0003088    27.4  < 2e-16 ***
humidity3pm      0.0128827  0.0002545    50.6  < 2e-16 ***
pressure3pm     -0.0108645  0.0006401   -17.0  < 2e-16 ***
rain_todayYes    2.8902092  0.0091246   316.8  < 2e-16 ***
month2           0.0700942  0.0221127     3.2  0.00153 ** 
month3           0.0023127  0.0205786     0.1  0.91052    
month4          -0.0523097  0.0217401    -2.4  0.01612 *  
month5          -

In [None]:
# Generate and visualize predictions

In [None]:
# use tweedie glmmTMB instead of the hurdle zigamma


In [None]:
# Use GAM with tweedie to capture non linear relationships

In [None]:
# Statistical model comparison (AIC x BIC) 

In [None]:
# Diagnose the GAM using DHARMa

In [None]:
# Bayesian Inference?