<a href="https://colab.research.google.com/github/Ferrariagustinpablo/Data-Analytics-in-R/blob/main/Merge_in_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [1]:
# Install data.table only when it is not already available, so re-running
# the notebook does not reinstall the package every time.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [2]:
# Install bikeshare14 only when it is not already available, so re-running
# the notebook does not reinstall the package every time.
if (!requireNamespace("bikeshare14", quietly = TRUE)) {
  install.packages("bikeshare14")
}

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [3]:
# Load
library(dplyr)
library(ggplot2)
library(tidyr)

# Specially in this notebook we are going to use data tables
library(data.table)
library(bikeshare14)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




# merge()

## Inner and Full join

In [None]:
# Inner join: merge() keeps only the titles found in both netflix and imdb
merge(x = netflix, y = imdb, by = "title")

In [None]:
# Full join: keep the unmatched rows of both netflix (x) and imdb (y)
merge(x = netflix, y = imdb, by = "title", all.x = TRUE, all.y = TRUE)

## Left and Right join

In [None]:
# Left join: every row of netflix (x) is kept, matched to imdb by title
merge(x = netflix, y = imdb, by = "title", all.x = TRUE)

# Right join: every row of imdb (y) is kept, matched to netflix by title
merge(x = netflix, y = imdb, by = "title", all.y = TRUE)

# Merge with data.table syntax

DT[i, on]

## Right join

In [None]:
# data.table syntax: X[i, on] keeps every row of i, so this is a right join
# that looks up each population row in capitals by city
capitals[population, on = "city"]

# The same right join written with merge()
merge(x = capitals, y = population, by = "city", all.y = TRUE)

## Inner / Outer join

In [None]:
# Inner join with the data.table syntax: nomatch = NULL drops the rows of
# population that have no match in capitals. NULL is the documented
# spelling; 0 is accepted only for backward compatibility.
capitals[population, on = .(city), nomatch = NULL]

# Anti-joins
Anti-joins are useful when you want to filter rows in one table not found in the other.

In [None]:
# Anti-join: rows of population whose city has no match in capitals
population[!capitals, on = "city"]

# Joining and computing with j

Using the j argument to compute information from the result of the join will allow you to quickly ask and answer questions about your data. 

In [None]:
# Inner join capitals to population by city, then sum the percentage
# column over the matched rows only (nomatch = NULL drops unmatched rows;
# it is the documented replacement for the legacy nomatch = 0)
population[capitals, on = .(city), nomatch = NULL,
           j = sum(percentage)]

# .EACHI

`by = .EACHI` evaluates `j` once for each row of `i` (the data.table inside the brackets). Combined with `.N`, it returns the number of matches in the left data.table for each row in the right data.table.

In [None]:
# For each row of life_exp, count the matching rows in continents,
# i.e. how many continents each country is listed in
continents[life_exp, j = .N, by = .EACHI, on = "country"]

# Chaining 

In [None]:
# Calculate average life expectancy per continent:
#   1. inner join life_exp to continents by country (nomatch = NULL drops
#      unmatched rows; it is the documented replacement for nomatch = 0),
#   2. chain a second [] that averages years within each continent.
continents[life_exp, on = .(country), nomatch = NULL][
  , j = mean(years), by = .(continent)
]

# merging with different key names

## data.table syntax

In [None]:
# Inner join guardians to students: the guardian column of students is
# matched against the name column of guardians. nomatch = NULL drops
# unmatched rows (documented replacement for the legacy nomatch = 0).
students[guardians, on = .(guardian = name), nomatch = NULL]


# or


# Store the key mapping in a named character vector: the topic column of
# teachers is matched against the subject column of locations
join_key <- c("topic" = "subject")

# Right join: every row of locations is kept
teachers[locations, on = join_key]

## merge syntax with suffixes

In [None]:
# Inner join parents to students, matching the name column of parents with
# the student column of students. A non-key column present in both tables
# gets a dotted suffix (col.parent / col.child) so the original column name
# stays readable, following merge()'s default ".x" / ".y" convention.
merge(
  x = parents, y = students,
  by.x = "name", by.y = "student",
  suffixes = c(".parent", ".child")
)

In [None]:
# Left join area onto capital_pop by state; non-key columns present in
# both tables are disambiguated with the ".pop" and ".area" suffixes
merge(
  x = capital_pop, y = area,
  by = "state",
  all.x = TRUE,
  suffixes = c(".pop", ".area")
)

# merging with more than one feature

## Changing key datatype to merge

In [None]:

# First attempt at the right join on both keys.
# NOTE(review): presumably the semester columns have incompatible classes
# at this point, in which case the join fails. try() lets the error be
# shown without stopping the rest of the cell - confirm against the data.
try(subjects[locations, on = .(subject, semester)])

# Fix the column class (":=" updates locations by reference)
locations[, semester := as.integer(semester)]

# Right join again, now that semester is an integer in locations
subjects[locations, on = .(subject, semester)]

# Transforming dataFrame to Data.table

In [None]:
# Build a data.table from netflix; its row names are stored in a new
# column called "series"
netflix_dt <- as.data.table(
  x = netflix,
  keep.rownames = "series"
)

# Handling duplicates

## unique()

In [None]:
# Deduplicate heart_2 by gene, scanning from the bottom so that the last
# row (probe) of each gene is the one retained
heart_3 <- unique(
  x = heart_2,
  by = "gene",
  fromLast = TRUE
)


## duplicated()

In [None]:
# Drop every row that is followed by a later row with the same gene,
# which leaves the last occurrence of each gene
heart <- heart[
  !duplicated(heart, by = "gene", fromLast = TRUE),
]

# Concatenating

## rbind()

In [None]:
# Stack the week 50 and week 51 case counts into a single table
rbind(
  ebola_W50,
  ebola_W51
)

# ebola_W52 is missing one of the columns; fill = TRUE pads it with NA
rbind(
  ebola_W50,
  ebola_W51,
  ebola_W52,
  fill = TRUE
)

## rbindlist()

In [None]:
# Stack the tables held in the gdp list, matching columns by name; each
# row is tagged in a new "continent" column with the list element it
# came from
gdp_all_3 <- rbindlist(
  l = gdp,
  use.names = TRUE,
  idcol = "continent"
)

# Set operations: rows in both datasets.

## fintersect, funion and fsetdiff

In [None]:
# Rows that appear in both the Asia and the Europe tables
fintersect(x = gdp$asia, y = gdp$europe)

`funion()` returns a data.table containing all countries in either Europe or Asia, with each country appearing only once in the result.

The difference between funion and rbindlist is that funion returns a dataset without duplicates (by default).

In [None]:
# Rows that appear in the Asia table, the Europe table, or both
funion(x = gdp$asia, y = gdp$europe)

In [None]:
# African rows that are not present in middle_east
fsetdiff(x = gdp$africa, y = middle_east)

# Asian rows that are not present in middle_east
fsetdiff(x = gdp$asia, y = middle_east)

# European rows that are not present in middle_east
fsetdiff(x = gdp$europe, y = middle_east)


# Or in a single call: gdp is a list of tables, so apply the same set
# difference to every element
lapply(gdp, function(region) fsetdiff(region, middle_east))

# From wide to long format

`measure.vars` specifies the columns to stack.

In [None]:
# Wide to long: year is the identifier column, the other columns are
# stacked
melt(data = gdp_per_capita, id.vars = "year")

# Same reshape, naming the stacked columns "country" and "gdp_pc"
melt(
  data = gdp_per_capita,
  id.vars = "year",
  variable.name = "country",
  value.name = "gdp_pc"
)

By default the "variable" column in the result will be a factor. You can change this by setting variable.factor = FALSE.

In [None]:
# Stack the Week_50 and Week_51 columns into period / cases pairs,
# with Location as the identifier column
melt(
  data = ebola_wide,
  id.vars = "Location",
  measure.vars = c("Week_50", "Week_51"),
  variable.name = "period",
  value.name = "cases"
)

Using id.vars together with measure.vars keeps only those columns in the result.

# From long to wide (data.table)

In [None]:
# Long to wide: one row per country, one column per year, with the
# cells taken from the population column
dcast(
  data = gdp_oceania,
  formula = country ~ year,
  value.var = "population"
)

In [None]:
# Reshape from long to wide format: one row per country, with the gdp and
# population values spread across one column per year
wide <- dcast(
  data = gdp_oceania,
  formula = country ~ year,
  value.var = c("gdp", "population")
)

# Convert to a matrix, using the country column as the row names
as.matrix(x = wide, rownames = "country")