In [None]:
## EXAMPLE 1

# Crop yield data from Frossard 2019. We compare the number of ears of wheat
# per m2 (nr.ears.m2) with DW.ears_with_grains.g.m2.
# NOTE(review): the column name suggests the dry weight (g per m2) of the ears
# carrying grains rather than a count -- confirm against the data description.
crop_data <- read.table("Frossard_2019.csv", header = TRUE, sep = ",")
plot(
  crop_data$nr.ears.m2,
  crop_data$DW.ears_with_grains.g.m2,
  type = "p",
  col = "darkgreen",
  cex = 3,
  lwd = 4
)

# Correlation coefficient and p-value.
# cor.test() is called without a method argument, so the Pearson method is used.

result <- cor.test(crop_data$nr.ears.m2, crop_data$DW.ears_with_grains.g.m2)
result

# Compact summary of the same test: r, r squared and the p-value.
with(
  result,
  cat("\nCorrelation\n-------\nr =", estimate, "\nr2 =", estimate^2, "\np =", p.value, "\n")
)

In [None]:
# Let's look now at the average annual temperature.
# The years have been relabelled for simplicity to start from 1.
Temperatures <- c(51.5,52.0,52.5,52.7,48.6,52.3,49.6,50.8,51.0,52.8,52.0,52.6,53.0,52.9,51.4,50.8,51.2,50.3,51.0,50.4,51.6,50.6,49.7,51.0,53.9,53.5,52.1,50.6,51.8,51.7,51.2,52.4,50.1,53.6,50.3,54.7,53.9,54.3,53.4,52.9,53.3,53.7,53.8,52.0,55.0,52.1,53.4,53.8,53.8,51.9,52.1,52.7,51.8,56.6,53.3,55.6,56.3,56.2,56.1,56.2,53.6,55.7,56.3)
# One year label per temperature reading (1, 2, ..., 63). Deriving the labels
# from the data guarantees the two vectors always have the same length.
Years <- as.numeric(seq_along(Temperatures))
plot(Years, Temperatures, type = "p", col = "darkred", cex = 3, lwd = 4)

# Pearson correlation (the default method).
result <- cor.test(Years, Temperatures)
cat("\nCorrelation of annual temperatures\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")

# Let's also look at the Spearman and Kendall correlations.
# We have "ties" in our data, where the temperatures are the same across two years.
# We use an approximation to handle these ties rather than the exact method.
# (exact = FALSE is spelled out in full: F is an ordinary variable that can be reassigned.)

# For Spearman the reported estimate is rho.
result <- cor.test(Years, Temperatures, method = "spearman", exact = FALSE)
cat("\nCorrelation of annual temperatures (Spearman)\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")
# For Kendall the reported estimate is tau.
result <- cor.test(Years, Temperatures, method = "kendall", exact = FALSE)
cat("\nCorrelation of annual temperatures (Kendall)\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")

In [None]:
## EXAMPLE 2

# Three independent samples of 200 normally distributed values, all drawn with
# the same parameters (mean 50, standard deviation 10).

X <- rnorm(n = 200, mean = 50, sd = 10)
Y <- rnorm(n = 200, mean = 50, sd = 10)
Z <- rnorm(n = 200, mean = 50, sd = 10)

# Build a 50-step "viridis"-like colour ramp from four anchor colours. The ramp
# starts at yellow (#FDE725) and ends at dark purple (#440154).
# Z is cut into 50 equal-width bins and each point takes the colour of its bin,
# so the lowest Z values are yellow and the highest are dark purple.
# The correlation between X and Y is reported as well.

YlOrBrRdBu <- c("#FDE725", "#21908C", "#3B528B", "#440154")
col <- colorRampPalette(YlOrBrRdBu)(50)
cols <- col[cut(Z, breaks = 50)]
plot(X, Y, col = cols, type = "p", cex = 3, lwd = 2)
result <- cor.test(X, Y)
with(
  result,
  cat("Correlation of X and Y\nr =", estimate, "\nr2 =", estimate^2, "\np =", p.value, "\n")
)

In [None]:
# Now let's divide X & Y by Z.
# This may be some normalisation procedure or similar approach.
# Both ratios now share the same denominator Z.
# We are using the same colours as determined in the previous panel.
plot(X / Z, Y / Z, col = cols, type = "p", cex = 3, lwd = 2)
result <- cor.test(X / Z, Y / Z)
# The heading names the ratios so this output can be told apart from the plain X vs Y result.
cat("Correlation of X/Z and Y/Z\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")

In [None]:

# Install the corrplot package only if it is not already available, so that
# re-running this cell does not download and reinstall it every time.

if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}

In [None]:
## Example 3

# Attach the corrplot package.

library(corrplot)

# Fill a 10 x 10 matrix with random exponential values (rate 5) and name the
# columns; each column is then treated as one variable.

A <- matrix(rexp(n = 100, rate = 5), nrow = 10)

colnames(A) <- c(
  "Alpha", "Bravo", "Charlie", "Delta", "Echo",
  "Foxtrot", "Golf", "Hotel", "India", "Juliett"
)

# Pairwise Pearson correlation matrix of the columns.
correlations <- cor(A, method = "pearson")

# Matching matrix of p-values for those correlations.
p <- cor.mtest(A)$p

# Plot the upper triangle of the correlation matrix and indicate which
# correlations are not significant at the chosen level (0.25 here).

corrplot(correlations, type = "upper", p.mat = p, sig.level = 0.25)

In [None]:
## Further investigations

## EXAMPLE 1
# What will sorting the data (like ears <- sort(crop_data$nr.ears.m2)) first do to the correlation coefficients and significance?
#
# How does the correlation test relate to the linear regression results using the same variables?


# What happens if you change year values slightly, how much do the correlations change?
# Tip: You can add normally distributed random noise (mean=0 and sd=0.05) to Years as Years+rnorm(length(Years),0,.05)
#
# What happens to the correlations if you change one value to be a huge outlier?

## EXAMPLE 2
# What happens if you alter the standard deviation for X/Y/Z?
#
# Does it match the equation expectations from the slides?
#
# What could we do to make the results match expectations better?

## EXAMPLE 3
# Verify that the correlation matrix values match calculating the correlations between each column.
# Hint: can access each column like A[,"Alpha"] or A[,"Golf"], and each element of the correlation matrix like correlations["Alpha","Golf"].
#
# If we change the distribution from exponential to normal, how many correlations are "falsely" significant?
# What is the connection to multiple hypothesis correction? (Hint: there are approximately N^2 elements in the correlation matrix.)
#
# If we change the correlation to spearman (from pearson), would we expect to see more significant correlations with the exponential distribution?



In [None]:
## EXAMPLE 1
# What will sorting the data (like ears <- sort(crop_data$nr.ears.m2)) first do to the correlation coefficients and significance?

# Sorting each variable on its own breaks the original pairing between the two
# measurements, so the original relationship (and its correlation) is lost and
# replaced by an artificial one.

ears <- sort(crop_data$nr.ears.m2)
weight <- sort(crop_data$DW.ears_with_grains.g.m2)
plot(ears, weight, type = "p", col = "darkgreen", cex = 3, lwd = 4)
result <- cor.test(ears, weight)
cat("Correlation of independently sorted crop data\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")

# The plot now looks very correlated, as both variables were sorted separately, and indeed there is a strong and significant correlation.
# Preprocessing data without careful thought can introduce or remove many important correlations, so be careful!
# Instead, if we sort row by row, preserving the relationship, the plot looks the same as before.

sorted_crop <- crop_data[order(crop_data$nr.ears.m2), ]
plot(sorted_crop$nr.ears.m2, sorted_crop$DW.ears_with_grains.g.m2, type = "p", col = "darkgreen", cex = 3, lwd = 4)

# How does the correlation test relate to the linear regression results using the same variables?

# Plot the data again, but now add the linear regression (remember it is y ~ x).
plot(crop_data$nr.ears.m2, crop_data$DW.ears_with_grains.g.m2, type = "p", col = "darkgreen", cex = 3, lwd = 4)
# Passing the data frame through `data =` keeps the coefficient names in the
# summary short (nr.ears.m2 instead of crop_data$nr.ears.m2).
model <- lm(DW.ears_with_grains.g.m2 ~ nr.ears.m2, data = crop_data)
abline(model, col = "red2")
summary(model)

## Note the similarities in Multiple R-squared and p-value.

In [None]:
## EXAMPLE 1 (continued)
# What happens if you change year values slightly, how much do the correlations change?
# Tip: You can add normally distributed random noise (mean=0 and sd=0.05) to Years as Years+rnorm(length(Years),0,.05)


Temperatures <- c(51.5,52.0,52.5,52.7,48.6,52.3,49.6,50.8,51.0,52.8,52.0,52.6,53.0,52.9,51.4,50.8,51.2,50.3,51.0,50.4,51.6,50.6,49.7,51.0,53.9,53.5,52.1,50.6,51.8,51.7,51.2,52.4,50.1,53.6,50.3,54.7,53.9,54.3,53.4,52.9,53.3,53.7,53.8,52.0,55.0,52.1,53.4,53.8,53.8,51.9,52.1,52.7,51.8,56.6,53.3,55.6,56.3,56.2,56.1,56.2,53.6,55.7,56.3)
# One year label per temperature reading (1, 2, ..., 63).
Years <- as.numeric(seq_along(Temperatures))

# Standard deviation of the noise added to the years. 0.05 is the value from
# the tip above; set it to 2 to see the effect of noise large enough to swap
# the order of neighbouring years.
noise_sd <- 0.05
Years <- Years + rnorm(length(Years), mean = 0, sd = noise_sd)
plot(Years, Temperatures, type = "p", col = "darkred", cex = 3, lwd = 4)

# Pearson works on the values themselves, so it reacts to the noise.
result <- cor.test(Years, Temperatures, method = "pearson")
cat("\nCorrelation of annual temperatures\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")

# Spearman only uses ranks. exact = FALSE because the temperatures contain ties.
result <- cor.test(Years, Temperatures, method = "spearman", exact = FALSE)
cat("\nCorrelation of annual temperatures (Spearman)\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")

# There is a slight but insignificant change to the correlation and p value when using pearson R.
# Since the years don't change enough to swap ranks, the spearman R is totally unchanged.
# Increasing the standard deviation of the noise to say 2 will affect both correlations.


# What happens to the correlations if you change one value to be a huge outlier?

# Pearson correlation can be totally changed (depending on how big the outlier is), but there will be almost no effect to spearman, as the coordinate doesn't matter, only the ranks.

In [None]:
## EXAMPLE 2
# What happens if you alter the standard deviation for X/Y/Z?

# If the standard deviation is huge the result is less significant, while a lower deviation is more reliably significant.
# Huge outliers with high deviation can skew the data much more, so this is expected.

# Does it match the equation expectations from the slides?

# Case 1: X, Y and Z all have mean 50 and standard deviation 10.
X <- rnorm(200, mean = 50, sd = 10)
Y <- rnorm(200, mean = 50, sd = 10)
Z <- rnorm(200, mean = 50, sd = 10)
result <- cor.test(X / Z, Y / Z)
cat("Correlation of X/Z and Y/Z\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")

# Case 2: the shared denominator Z is much more variable (sd 20) than X and Y (sd 5).
X <- rnorm(200, mean = 50, sd = 5)
Y <- rnorm(200, mean = 50, sd = 5)
Z <- rnorm(200, mean = 50, sd = 20)
result <- cor.test(X / Z, Y / Z)
cat("Correlation of X/Z and Y/Z\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")

# Yes, we can calculate this.
# In the first case, the equation predicts r=0.5, and in the second it is about 0.941
# NOTE(review): both numbers agree with r ~ cv_z^2 / (cv_x^2 + cv_z^2), where
# cv = sd / mean and X, Y share the same cv -- confirm this is the slide equation.


# What could we do to make the results match expectations better?

# Case 3: same standard deviations as case 2, but with a much larger mean.
X <- rnorm(200, mean = 10000, sd = 5)
Y <- rnorm(200, mean = 10000, sd = 5)
Z <- rnorm(200, mean = 10000, sd = 20)
result <- cor.test(X / Z, Y / Z)
cat("Correlation of X/Z and Y/Z\nr =", result$estimate, "\nr2 =", result$estimate^2, "\np =", result$p.value, "\n")

# The equation shown in the lectures is a slight approximation, as there is also a dependence on the mean in the full expression.
# If we increase the mean, we can get r values much closer to 0.941.

In [None]:
## EXAMPLE 3
# Verify that the correlation matrix values match calculating the correlations between each column.
# Hint: can access each column like A[,"Alpha"] or A[,"Golf"], and each element of the correlation matrix like correlations["Alpha","Golf"].
#
# Uses A and correlations as left by the Example 3 cell.
# The trailing "\n" ends the last output line cleanly.
cat("Direct test\nr = ", cor.test(A[, "Alpha"], A[, "Golf"])$estimate, "\nMatrix correlation\nr =", correlations["Alpha", "Golf"], "\n")

# Yes they are the same.


# If we change the distribution from exponential to normal, how many correlations are "falsely" significant?
# What is the connection to multiple hypothesis correction? (Hint: there are approximately N^2 elements in the correlation matrix.)

# Draw from a normal distribution this time (rnorm rather than rexp), as the question asks.
A <- matrix(rnorm(100), nrow = 10)
colnames(A) <- c("Alpha", "Bravo", "Charlie", "Delta", "Echo", "Foxtrot", "Golf", "Hotel", "India", "Juliett")
correlations <- cor(A, method = "pearson")
p <- cor.mtest(A)$p
corrplot(correlations, type = "upper", p.mat = p, sig.level = 0.05)

# This is similar to what I showed in the lecture.
# There are about (N**2)/2 tests (exactly N*(N-1)/2 = 45 distinct pairs for N = 10), so we might expect 2-3 to be significant with p=0.05 by chance.
# None would be significant if we did a correction.


# If we change the correlation to spearman (from pearson), would we expect to see more significant correlations with the exponential distribution?

# An exponential distribution generally has more extreme values (outliers) than a normal distribution, which could either lead to a larger or smaller correlation.
# Since spearman is more robust to outliers (and we don't expect any significance from a random set of values), we would expect it to have fewer significant correlations compared to pearson.

In [None]:
# Pearson correlation tests on three of the datasets bundled in
# datasaurus_dozen (from the datasauRus package): pick the rows of one dataset
# by its label, then test x against y.
library(datasauRus)

dino <- subset(datasaurus_dozen, dataset == "dino")
cor.test(dino$x, dino$y)

slant_down <- subset(datasaurus_dozen, dataset == "slant_down")
cor.test(slant_down$x, slant_down$y)

slant_up <- subset(datasaurus_dozen, dataset == "slant_up")
cor.test(slant_up$x, slant_up$y)