## TASK 3

In [14]:
source("./Imports.R")

## Description
Select 3 columns: Id, DisplayName, TotalViews from given Questions table:  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;OwnerUserId, TotalViews from table Posts  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;TotalViews: sum of ViewCount  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Takes only Posts where PostTypeId is 1  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Groups the output by OwnerUserId  
Joins Questions table and Users table by user id  
Sorts the data by TotalViews (Descending)  
Takes first 10 outputs  

### Effectively returns the top 10 users with the highest total view count on their questions (posts with PostTypeId 1)

In [15]:
# Reference (SQL) implementation: top 10 users by total views of their
# PostTypeId = 1 posts.
#
# Posts: table with at least PostTypeId, OwnerUserId and ViewCount columns.
# Users: table with at least Id and DisplayName columns.
# Returns the sqldf() result with columns Id, DisplayName, TotalViews.
sql_3 <- function(Posts, Users) {
    # The inner query sums ViewCount per owner; the outer query attaches the
    # user's display name and keeps the 10 largest totals.
    query <- "SELECT Id, DisplayName, TotalViews
        FROM (
            SELECT OwnerUserId, SUM(ViewCount) as TotalViews
            FROM Posts
            WHERE PostTypeId = 1
            GROUP BY OwnerUserId
            ) AS Questions
        JOIN Users
        ON Users.Id = Questions.OwnerUserId
        ORDER BY TotalViews DESC
        LIMIT 10"
    sqldf(query)
}

In [16]:
# Base R implementation: top 10 users by total views of their
# PostTypeId == 1 posts.
#
# Posts: data frame with at least PostTypeId, OwnerUserId and ViewCount.
# Users: data frame with at least Id and DisplayName.
# Returns a data frame with columns Id, DisplayName, TotalViews, sorted by
# TotalViews in decreasing order and limited to at most 10 rows.
base_3 <- function(Posts, Users) {
    # Keep only type-1 posts and the two columns the aggregation needs.
    questions <- subset(Posts, PostTypeId == 1,
                        select = c("OwnerUserId", "ViewCount"))

    # Sum the view counts per owner; name the key "Id" so it lines up with Users.
    views_per_owner <- setNames(
        aggregate(questions$ViewCount,
                  by = list(questions$OwnerUserId),
                  FUN = sum),
        c("Id", "TotalViews")
    )

    # merge() defaults to an inner join, so owners missing from Users are dropped.
    with_names <- merge(views_per_owner, Users, by = "Id")

    # Pick the row indices of the (up to) 10 largest totals, then project.
    top_rows <- head(order(with_names$TotalViews, decreasing = TRUE), n = 10)
    with_names[top_rows, c("Id", "DisplayName", "TotalViews")]
}

In [17]:
# dplyr implementation: top 10 users by total views of their
# PostTypeId == 1 posts.
#
# Posts: table with at least PostTypeId, OwnerUserId and ViewCount.
# Users: table with at least Id and DisplayName.
# Returns a plain data.frame with columns Id, DisplayName, TotalViews.
dplyr_3 <- function(Posts, Users) {
    # Stage 1: total view count per post owner, type-1 posts only.
    views_per_owner <- Posts %>%
        filter(PostTypeId == 1) %>%
        group_by(OwnerUserId) %>%
        summarise(TotalViews = sum(ViewCount))

    # Stage 2: attach display names, expose the owner key as Id, and keep the
    # 10 largest totals.
    top_owners <- views_per_owner %>%
        inner_join(Users, by = c("OwnerUserId" = "Id")) %>%
        select(Id = OwnerUserId, DisplayName, TotalViews) %>%
        arrange(desc(TotalViews)) %>%
        head(10)

    as.data.frame(top_owners)
}

In [18]:
# data.table implementation: top 10 users by total views of their
# PostTypeId == 1 posts.
#
# Posts: table with at least PostTypeId, OwnerUserId and ViewCount.
# Users: table with at least Id and DisplayName.
# Returns a plain data.frame with columns Id, DisplayName, TotalViews, sorted
# by TotalViews in decreasing order and limited to at most 10 rows.
table_3 <- function(Posts, Users) {
    # as.data.table() works on local data.table versions of the inputs;
    # setDT() would convert the caller's data frames to data.tables by
    # reference, changing objects outside this function.
    posts_dt <- as.data.table(Posts)
    users_dt <- as.data.table(Users)

    # Total view count per post owner, type-1 posts only.
    posts_filtered <- posts_dt[PostTypeId == 1,
                               .(TotalViews = sum(ViewCount)),
                               by = OwnerUserId]

    # merge() defaults to an inner join: only owners present in Users remain.
    joined <- merge(users_dt, posts_filtered, by.x = "Id", by.y = "OwnerUserId")
    ranked <- joined[, .(Id, DisplayName, TotalViews)][order(-TotalViews)]

    # head() returns at most 10 rows; indexing with 1:10 would pad the result
    # with all-NA rows when fewer than 10 users match.
    as.data.frame(head(ranked, 10))
}

In [19]:
# Run all four implementations on the same inputs and hand the results to
# comp(). comp() is not defined in this section (presumably it comes from
# Imports.R -- confirm); judging by the output printed below, it reports whether
# the base, dplyr and data.table results match the SQL result.
comp(
    sql_3(Posts,Users),
    base_3(Posts, Users),
    dplyr_3(Posts, Users),
    table_3(Posts, Users)
)

[1] "COMPARING GIVEN OUTPUTS TO SQL ONE: "
[1] "BASE: TRUE"
[1] "DPLYR: TRUE"
[1] "TABLE: TRUE"


In [20]:
# Display the result of the data.table implementation for inspection.
table_3(Posts, Users)

Id,DisplayName,TotalViews
<int>,<chr>,<int>
101,Mark Mayo,2151776
9009,JonathanReez,1941733
693,RoflcoptrException,1865428
140,hippietrail,1849585
26,Gagravarr,1536327
3736,nsn,1148040
396,jrdioko,983986
10051,Nean Der Thal,894454
583,Jonik,855538
1893,lambshaanxy,850706
