## TASK 2

In [22]:
source("./Imports.R")

## Description
Selects 4 columns (Year, Month, PostsNumber, MaxScore) from the Posts table passed as a parameter to the function.  
Year: year extracted from CreationDate  
Month: month extracted from CreationDate  
PostsNumber: number of posts in the group  
MaxScore: maximum Score in the group  
Takes only posts where PostTypeId is 1 or 2  
Groups them by Year, then by Month  
Returns only groups with more than 1000 posts  

### Effectively returns the months (with their years) in which there were at least 1001 posts of type 1 or 2


In [23]:
#' Monthly post counts and maximum scores, computed with sqldf.
#'
#' Reference implementation: the other `*_2` functions are compared against
#' this one.
#'
#' @param Posts Data frame with at least the columns `CreationDate`,
#'   `PostTypeId` and `Score`; the query refers to it by the name `Posts`.
#' @return The query result: one row per (Year, Month) group of posts with
#'   `PostTypeId` 1 or 2 that contains more than 1000 posts, with columns
#'   `Year`, `Month`, `PostsNumber` and `MaxScore`.
sql_2 <- function(Posts) {
    # Year and Month are taken from CreationDate with STRFTIME; HAVING keeps
    # only the groups whose row count exceeds 1000.
    monthly_query <- "SELECT STRFTIME('%Y', CreationDate) AS Year, STRFTIME('%m', CreationDate) AS Month,
        COUNT(*) AS PostsNumber, MAX(Score) AS MaxScore
        FROM Posts
        WHERE PostTypeId IN (1, 2)
        GROUP BY Year, Month
        HAVING PostsNumber > 1000"
    sqldf(monthly_query)
}

In [24]:
#' Monthly post counts and maximum scores, computed with base R only.
#'
#' Mirrors the SQL query in `sql_2`: keeps posts whose `PostTypeId` is 1 or 2,
#' groups them by the year and month of `CreationDate`, and returns the groups
#' that contain more than 1000 posts.
#'
#' @param Posts Data frame with at least the columns `Id`, `CreationDate`,
#'   `PostTypeId` and `Score`.
#' @return Data frame with columns `Year`, `Month`, `PostsNumber` and
#'   `MaxScore`, one row per qualifying (Year, Month) group, ordered by
#'   `Year` then `Month`. Zero rows (same columns) when nothing qualifies.
base_2 <- function(Posts) {
    # subset() treats an NA condition as FALSE, so rows with a missing
    # PostTypeId are dropped just like in the SQL WHERE clause.
    qa_posts <- subset(Posts, PostTypeId %in% c(1, 2))

    # aggregate() stops with "no rows to aggregate" on empty input, whereas
    # the SQL query simply yields an empty result; return the empty shape.
    if (nrow(qa_posts) == 0L) {
        return(data.frame(
            Year = character(0),
            Month = character(0),
            PostsNumber = integer(0),
            MaxScore = qa_posts$Score,
            stringsAsFactors = FALSE
        ))
    }

    # Parse the timestamps once, and only for the rows that survived the
    # filter, then derive both grouping keys from the parsed value.
    created <- as.POSIXct(qa_posts$CreationDate)
    qa_posts$Year <- format(created, "%Y")
    qa_posts$Month <- format(created, "%m")

    # cbind(<name> = <column>) on the formula's left-hand side names the
    # aggregated column in the result.
    counts <- aggregate(
        cbind(PostsNumber = Id) ~ Year + Month,
        data = qa_posts,
        FUN = length
    )
    top_scores <- aggregate(
        cbind(MaxScore = Score) ~ Year + Month,
        data = qa_posts,
        FUN = max
    )

    # merge() joins the two summaries and sorts the rows by the key columns.
    monthly <- merge(counts, top_scores, by = c("Year", "Month"))
    monthly[monthly$PostsNumber > 1000, ]
}

In [25]:
#' Monthly post counts and maximum scores, computed with dplyr.
#'
#' @param Posts Data frame with at least the columns `CreationDate`,
#'   `PostTypeId` and `Score`.
#' @return Plain data frame with columns `Year`, `Month`, `PostsNumber` and
#'   `MaxScore`, one row per (Year, Month) group of posts with `PostTypeId`
#'   1 or 2 that contains more than 1000 posts.
dplyr_2 <- function(Posts) {
    # Derive the grouping keys from the creation timestamp.
    with_keys <- mutate(
        Posts,
        Year = format(as.POSIXct(CreationDate), "%Y"),
        Month = format(as.POSIXct(CreationDate), "%m")
    )

    # Keep only post types 1 and 2.
    qa_posts <- filter(with_keys, PostTypeId %in% c(1, 2))

    # One summary row per (Year, Month); .groups = "drop" returns an
    # ungrouped result.
    monthly <- summarise(
        group_by(qa_posts, Year, Month),
        PostsNumber = n(),
        MaxScore = max(Score),
        .groups = "drop"
    )

    busy_months <- filter(monthly, PostsNumber > 1000)
    as.data.frame(busy_months)
}

In [26]:
#' Monthly post counts and maximum scores, computed with data.table.
#'
#' Works on a private copy of the input: the caller's `Posts` object is
#' neither converted to a data.table nor given extra columns.
#'
#' @param Posts Data frame (or data.table) with at least the columns
#'   `CreationDate`, `PostTypeId` and `Score`.
#' @return Plain data frame with columns `Year`, `Month`, `PostsNumber` and
#'   `MaxScore`, one row per (Year, Month) group of posts with `PostTypeId`
#'   1 or 2 that contains more than 1000 posts.
table_2 <- function(Posts) {
    # as.data.table() leaves the caller's object alone, unlike setDT(), which
    # converts it in place.
    posts_dt <- as.data.table(Posts)

    # Row subsetting yields a new table, so the `:=` below only ever touches
    # this local object.
    qa_posts <- posts_dt[(PostTypeId == 1 | PostTypeId == 2)]

    # Parse the timestamps once and derive both grouping keys from them.
    qa_posts[, c("Year", "Month") := {
        created <- as.POSIXct(CreationDate)
        list(format(created, "%Y"), format(created, "%m"))
    }]

    monthly <- qa_posts[,
        .(PostsNumber = .N, MaxScore = max(Score)),
        by = .(Year, Month)
    ]
    as.data.frame(monthly[PostsNumber > 1000, ])
}

In [27]:
# Run all four implementations on the same Posts table and pass the results
# to comp(). Per the output printed below, comp() reports whether the base,
# dplyr and data.table results match the sqldf one.
# comp() is not defined in this section - presumably it comes from Imports.R.
comp(
    sql_2(Posts),
    base_2(Posts),
    dplyr_2(Posts),
    table_2(Posts)
)

[1] "COMPARING GIVEN OUTPUTS TO SQL ONE: "
[1] "BASE: TRUE"
[1] "DPLYR: TRUE"
[1] "TABLE: TRUE"


In [28]:
# Display the result produced by the data.table implementation.
table_2(Posts)

Year,Month,PostsNumber,MaxScore
<chr>,<chr>,<int>,<int>
2014,5,1012,133
2014,6,1053,74
2014,7,1068,80
2014,8,1030,58
2015,3,1195,73
2015,4,1191,71
2015,5,1233,90
2015,6,1511,165
2015,7,1191,81
2015,8,1269,126
