## Activity 2: Extract and Repurpose Data

### Input: 
Kagge Sqlite European Soccer Database 

### Output: 
R data frames persisted in csv files to facilitate easy Q&A

### Process: 
sqllite to R data frames to csv.

In [1]:
install.packages("RSQLite")

“installation of package ‘RSQLite’ had non-zero exit status”Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


In [2]:
install.packages("dplyr")

“installation of package ‘dplyr’ had non-zero exit status”Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


In [3]:
library(RSQLite)
library(dplyr)
library(DBI)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [4]:
#Connect to Database
con <- dbConnect(drv=RSQLite::SQLite(), dbname="database.sqlite")

In [5]:
# List all tables
alltables <- dbListTables(con)

In [6]:
print(alltables)

[1] "Country"           "League"            "Match"            
[4] "Player"            "Player_Attributes" "Team"             
[7] "Team_Attributes"   "sqlite_sequence"  


In [7]:
##Create a data.frame for each table

country <- dbGetQuery( con,'select * from country' )
league <- dbGetQuery( con,'select * from league' )
player <- dbGetQuery( con,'select * from player' )
player_attributes <- dbGetQuery( con,'select * from player_attributes' )
team <- dbGetQuery( con,'select * from team' )
team_attributes <- dbGetQuery( con,'select * from team_attributes' )
match <- dbGetQuery( con,'select * from match' )

### Steps to create a Player.csv

In [8]:
# Extracting all the columns which are to be taken directly.

result <- tbl_df(player_attributes) %>% select(player_api_id, player_fifa_api_id, overall_rating, gk_diving, gk_handling, gk_kicking, gk_positioning,  gk_reflexes ) %>% group_by(player_api_id, player_fifa_api_id) %>% top_n(1, overall_rating)
result <- distinct(result, player_api_id, .keep_all = TRUE)
result <- result %>% mutate(gk_total = gk_diving + gk_handling + gk_kicking + gk_positioning + gk_reflexes)

result <- full_join(x=result, y=player, by ="player_api_id")
result <- result[,c("player_api_id", "player_fifa_api_id.x","player_name","birthday", "overall_rating", "gk_total")]
collect(result)

player_api_id,player_fifa_api_id.x,player_name,birthday,overall_rating,gk_total
505942,218353,Aaron Appindangoye,1992-02-29 00:00:00,67,43
155782,189615,Aaron Cresswell,1989-12-15 00:00:00,74,51
162549,186170,Aaron Doran,1991-05-13 00:00:00,71,56
30572,140161,Aaron Galindo,1982-05-08 00:00:00,75,111
23780,17725,Aaron Hughes,1979-11-08 00:00:00,78,129
27316,158138,Aaron Hunt,1986-09-04 00:00:00,79,60
564793,221280,Aaron Kuhl,1996-01-30 00:00:00,61,64
30895,152747,Aaron Lennon,1987-04-16 00:00:00,84,146
528212,206592,Aaron Lennox,1993-02-19 00:00:00,48,237
101042,188621,Aaron Meijers,1987-10-28 00:00:00,69,54


#### Players by number of appearances (with any team they played with)

In [15]:
#Processing the home_player and away_player columns to get the player wise appearances

all_players <- count(match, home_player_1)
names(all_players) <- c("player_api_id", "1")

for(i in 1:21){ 

    temp <- select(match, 56+i)
    names(temp) <- c("player_api_id")
    temp <- count(temp, player_api_id)
    names(temp) <- c("player_api_id", i+1)
    
    all_players <- full_join(x=all_players, y=temp, by ="player_api_id")
    
}

In [19]:
# Replacing all the NA columns with 0 value for easier calculations

appearances <- all_players %>% mutate_each(funs(replace(., which(is.na(.)), 0)))

In [20]:
# Summation of the number of appearances from all columns

appearances$n_appearances<-rowSums(all_players[,2:23])

In [21]:
# Joining the n_appearances column to rest of the data process uptill now

result <-  inner_join(x=result, y=appearances, by ="player_api_id")
result <- result[,c("player_api_id", "player_fifa_api_id.x", "player_name", "birthday", "overall_rating", "gk_total", "n_appearances")]

In [22]:
result <- subset(result, player_api_id != 0)
result

player_api_id,player_fifa_api_id.x,player_name,birthday,overall_rating,gk_total,n_appearances
505942,218353,Aaron Appindangoye,1992-02-29 00:00:00,67,43,8
155782,189615,Aaron Cresswell,1989-12-15 00:00:00,74,51,75
162549,186170,Aaron Doran,1991-05-13 00:00:00,71,56,104
30572,140161,Aaron Galindo,1982-05-08 00:00:00,75,111,10
23780,17725,Aaron Hughes,1979-11-08 00:00:00,78,129,162
27316,158138,Aaron Hunt,1986-09-04 00:00:00,79,60,158
564793,221280,Aaron Kuhl,1996-01-30 00:00:00,61,64,5
30895,152747,Aaron Lennon,1987-04-16 00:00:00,84,146,181
528212,206592,Aaron Lennox,1993-02-19 00:00:00,48,237,1
101042,188621,Aaron Meijers,1987-10-28 00:00:00,69,54,167


#### Number of leagues players played in

In [23]:
#Processing the league_id column with respect to home_player and away_player columns

leagues <- select(match, league_id, home_player_1)
names(leagues) <- c("league_id", "player_api_id")

for(i in 1:21){ 

    temp <- select(match, 3, 56+i) 
    temp <- data.frame(na.omit(temp))
    names(temp) <- c("league_id", "player_api_id")
    
    leagues <- bind_rows(leagues, temp)
    
}

In [24]:
#leagues %>% group_by(player_api_id) %>% summarise (mean_mpg = count(league_id))

leagues <- leagues %>% group_by(player_api_id)
leagues <- summarise(leagues, n_leagues = length(unique(league_id)))
leagues <- subset(leagues, player_api_id != "NA")

In [25]:
# Joining the n_leagues column to rest of the data process uptill now

result <-  full_join(x=result, y=leagues, by ="player_api_id")
result <- result[,c("player_api_id", "player_fifa_api_id.x", "player_name", "birthday", "overall_rating", "gk_total", "n_appearances", "n_leagues")]

In [26]:
result

player_api_id,player_fifa_api_id.x,player_name,birthday,overall_rating,gk_total,n_appearances,n_leagues
505942,218353,Aaron Appindangoye,1992-02-29 00:00:00,67,43,8,1
155782,189615,Aaron Cresswell,1989-12-15 00:00:00,74,51,75,1
162549,186170,Aaron Doran,1991-05-13 00:00:00,71,56,104,1
30572,140161,Aaron Galindo,1982-05-08 00:00:00,75,111,10,1
23780,17725,Aaron Hughes,1979-11-08 00:00:00,78,129,162,1
27316,158138,Aaron Hunt,1986-09-04 00:00:00,79,60,158,1
564793,221280,Aaron Kuhl,1996-01-30 00:00:00,61,64,5,1
30895,152747,Aaron Lennon,1987-04-16 00:00:00,84,146,181,1
528212,206592,Aaron Lennox,1993-02-19 00:00:00,48,237,1,1
101042,188621,Aaron Meijers,1987-10-28 00:00:00,69,54,167,1


In [27]:
# Convert the final dataframe to required format and then save as a csv file

my.df <- data.frame(lapply(result, as.character), stringsAsFactors=FALSE)
write.csv(my.df, file = "player.csv")

### Steps to create a Team.csv

In [28]:
# Extracting all the columns which are to be taken directly and aplying distinct to remove duplicates

team_final <- tbl_df(team_attributes) %>% select(team_api_id, team_fifa_api_id, 
                                                 buildUpPlaySpeed, buildUpPlayDribbling, buildUpPlayPassing, 
                                                 chanceCreationPassing, chanceCreationCrossing, chanceCreationShooting, 
                                                 defencePressure, defenceAggression, defenceTeamWidth) %>% group_by(team_api_id, team_fifa_api_id)

team_final <- distinct(team_final, team_api_id, .keep_all = TRUE) 

In [29]:
# Make all NA cells 0 before performing any calculations 

team_final <- team_final %>% mutate_each(funs(replace(., which(is.na(.)), 0)))

# Add the respective columns to get Build, Chance and Defence values for each team

team_final <- team_final %>% mutate(build_total = buildUpPlaySpeed + buildUpPlayDribbling + buildUpPlayPassing, 
                                   chance_total = chanceCreationPassing + chanceCreationCrossing +chanceCreationShooting,
                                   defense_total = defencePressure + defenceAggression + defenceTeamWidth)

In [30]:
# Join team attributes with team table to get the team name

team_final <- full_join(x=team_final, y=team, by ="team_api_id")
team_final <- team_final[,c("team_api_id", "team_fifa_api_id.x","team_long_name","team_short_name", "build_total", "chance_total", "defense_total")]

In [31]:
# Calculate goals by all the home teams separately from match table

home_goals <- tbl_df(match) %>% select(home_team_api_id, home_team_goal) %>% group_by(home_team_api_id)
home_goals <- distinct(home_goals, home_team_api_id, .keep_all = TRUE)
names(home_goals) <- c("team_api_id", "goals")

In [32]:
# Calculate goals by all the away teams separately from match table

away_goals <- tbl_df(match) %>% select(away_team_api_id, away_team_goal) %>% group_by(away_team_api_id)
away_goals <- distinct(away_goals, away_team_api_id, .keep_all = TRUE)
names(away_goals) <- c("team_api_id", "goals")

In [33]:
# Join home and away goal tables and get cummulative goals for each team

goals <- full_join(home_goals, away_goals, by = "team_api_id") 
goals <- tbl_df(goals) %>% mutate(goal_total = goals.x + goals.y)

In [34]:
# Perform final join to add the total_goals column for each team in team_final dataframe

team_final <- full_join(x=team_final, y=goals, by ="team_api_id")
team_final <- distinct(team_final, team_api_id, .keep_all = TRUE)

In [35]:
# Taking only the required columns 

team_final <- team_final[,c("team_api_id", "team_fifa_api_id.x","team_long_name","team_short_name", "build_total", "chance_total", "defense_total", "goal_total")]
collect(team_final)

team_api_id,team_fifa_api_id.x,team_long_name,team_short_name,build_total,chance_total,defense_total,goal_total
9930,434,FC Aarau,AAR,110,180,150,4
8485,77,Aberdeen,ABE,140,210,200,1
8576,614,AC Ajaccio,AJA,100,135,130,0
8564,47,Milan,ACM,75,170,125,2
10215,1901,Académica de Coimbra,ACA,60,165,90,2
10217,650,ADO Den Haag,HAA,65,145,90,5
8593,245,Ajax,AJA,60,180,90,6
9865,1861,UD Almería,ALM,110,160,165,5
8635,229,RSC Anderlecht,AND,85,180,190,5
8121,1530,Angers SCO,ANG,80,145,115,2


In [36]:
# Convert the final dataframe to required format and then save as a csv file

my.df <- data.frame(lapply(team_final, as.character), stringsAsFactors=FALSE)
write.csv(my.df, file = "team.csv")

### References:

[1] http://stackoverflow.com/questions/9802680/importing-files-with-extension-sqlite-into-r

[2] http://stat545.com/bit001_dplyr-cheatsheet.html

[3] https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html

[4] https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf

[5] https://cran.r-project.org/web/packages/dplyr/vignettes/databases.html

[6] https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html

[7] https://rpubs.com/bradleyboehmke/data_wrangling