# Extracting and Repurposing Data
The aim of this activity is to extract data from a European soccer database and repurpose it for a question-answering system similar to Amazon Alexa. I am using the dplyr library in R to reach this goal.

In [20]:
library(dplyr)

## Initializing Database and required tables
For the purpose of this activity, I will need to connect to a database and read the player, player attributes, team, team attributes and match tables.

In [2]:
# Open the existing SQLite file as a dplyr source. With create = FALSE the
# file must already exist; it is not created when missing.
db <- src_sqlite("database.sqlite", create = FALSE)

In [3]:
# Table handles for the five tables used below, all taken from `db`.
player_tbl <- db %>% tbl("player")
player_attr_tbl <- db %>% tbl("player_attributes")
team_tbl <- db %>% tbl("team")
team_attr_tbl <- db %>% tbl("team_attributes")
match_tbl <- db %>% tbl("match")

## Selecting the latest Date for every player
Since every player has more than one entry, recorded on different dates, it is essential to remove the unwanted data by selecting only the latest rating for each player.

In [4]:
# Latest attribute snapshot per player:
#   1. group the attribute rows by player and take the maximum date,
#   2. attach that date to the player table,
#   3. join back to the attributes on (player_api_id, date) so that only the
#      attribute row carrying that latest date remains.
player_tbl_grouped <- player_attr_tbl %>% group_by(player_api_id)
player_tbl_single_date <- player_tbl_grouped %>% summarise(date = max(date))
player_with_date <- player_tbl %>%
  inner_join(player_tbl_single_date, by = "player_api_id")
all_player_tbl <- player_with_date %>%
  inner_join(player_attr_tbl, by = c("player_api_id", "date"), copy = FALSE)

## Calculating the GK Rating for each Player

In [5]:
# Goalkeeper rating: plain sum of the five gk_* attribute columns.
all_player_tbl <- all_player_tbl %>%
  mutate(
    gk_rating = gk_diving + gk_handling + gk_kicking + gk_positioning + gk_reflexes
  )

## Selecting only the required columns
Since we need only a few columns from the entire data set, I am selecting them here.

In [6]:
# Keep only the columns that go into the player output.
all_player_tbl <- all_player_tbl %>%
  select(player_api_id, player_name, overall_rating, date, gk_rating)

## Finding Number of Appearances of every Player
To find the number of appearances of every player, I query each of the 22 player columns in the match table (11 home and 11 away) and count how many matches each player appears in for that column.

In [7]:
# For each of the 11 home and 11 away line-up columns of the match table:
# group by the player id in that column, count the rows per group, collect
# the counts, and rename the columns to player_api_id / appearance.
home_player1 <- match_tbl %>% group_by(home_player_1) %>% count() %>% collect() %>%
  select(player_api_id = home_player_1, appearance = n)
home_player2 <- match_tbl %>% group_by(home_player_2) %>% count() %>% collect() %>%
  select(player_api_id = home_player_2, appearance = n)
home_player3 <- match_tbl %>% group_by(home_player_3) %>% count() %>% collect() %>%
  select(player_api_id = home_player_3, appearance = n)
home_player4 <- match_tbl %>% group_by(home_player_4) %>% count() %>% collect() %>%
  select(player_api_id = home_player_4, appearance = n)
home_player5 <- match_tbl %>% group_by(home_player_5) %>% count() %>% collect() %>%
  select(player_api_id = home_player_5, appearance = n)
home_player6 <- match_tbl %>% group_by(home_player_6) %>% count() %>% collect() %>%
  select(player_api_id = home_player_6, appearance = n)
home_player7 <- match_tbl %>% group_by(home_player_7) %>% count() %>% collect() %>%
  select(player_api_id = home_player_7, appearance = n)
home_player8 <- match_tbl %>% group_by(home_player_8) %>% count() %>% collect() %>%
  select(player_api_id = home_player_8, appearance = n)
home_player9 <- match_tbl %>% group_by(home_player_9) %>% count() %>% collect() %>%
  select(player_api_id = home_player_9, appearance = n)
home_player10 <- match_tbl %>% group_by(home_player_10) %>% count() %>% collect() %>%
  select(player_api_id = home_player_10, appearance = n)
home_player11 <- match_tbl %>% group_by(home_player_11) %>% count() %>% collect() %>%
  select(player_api_id = home_player_11, appearance = n)

away_player1 <- match_tbl %>% group_by(away_player_1) %>% count() %>% collect() %>%
  select(player_api_id = away_player_1, appearance = n)
away_player2 <- match_tbl %>% group_by(away_player_2) %>% count() %>% collect() %>%
  select(player_api_id = away_player_2, appearance = n)
away_player3 <- match_tbl %>% group_by(away_player_3) %>% count() %>% collect() %>%
  select(player_api_id = away_player_3, appearance = n)
away_player4 <- match_tbl %>% group_by(away_player_4) %>% count() %>% collect() %>%
  select(player_api_id = away_player_4, appearance = n)
away_player5 <- match_tbl %>% group_by(away_player_5) %>% count() %>% collect() %>%
  select(player_api_id = away_player_5, appearance = n)
away_player6 <- match_tbl %>% group_by(away_player_6) %>% count() %>% collect() %>%
  select(player_api_id = away_player_6, appearance = n)
away_player7 <- match_tbl %>% group_by(away_player_7) %>% count() %>% collect() %>%
  select(player_api_id = away_player_7, appearance = n)
away_player8 <- match_tbl %>% group_by(away_player_8) %>% count() %>% collect() %>%
  select(player_api_id = away_player_8, appearance = n)
away_player9 <- match_tbl %>% group_by(away_player_9) %>% count() %>% collect() %>%
  select(player_api_id = away_player_9, appearance = n)
away_player10 <- match_tbl %>% group_by(away_player_10) %>% count() %>% collect() %>%
  select(player_api_id = away_player_10, appearance = n)
away_player11 <- match_tbl %>% group_by(away_player_11) %>% count() %>% collect() %>%
  select(player_api_id = away_player_11, appearance = n)

## Combining all the Results into one and then calculating the total appearance
After combining all the results into one, I am joining it with the all_player_tbl which stores all the required data

In [8]:
# Stack the 22 per-slot count tables. bind_rows() is used instead of rbind()
# because the inputs are dplyr tibbles (possibly still grouped) and
# bind_rows() is the dplyr-aware way to combine them.
home_players <- bind_rows(
  home_player1, home_player2, home_player3, home_player4, home_player5,
  home_player6, home_player7, home_player8, home_player9, home_player10,
  home_player11
)
away_players <- bind_rows(
  away_player1, away_player2, away_player3, away_player4, away_player5,
  away_player6, away_player7, away_player8, away_player9, away_player10,
  away_player11
)

# One row per player with the total over all slots.
#  - Missing player ids are dropped with is.na(); comparing against the
#    string "NA" only worked because NA comparisons are discarded by filter().
#  - summarise() yields exactly one (ungrouped) row per player, replacing the
#    earlier mutate() + distinct(.keep_all = T) emulation.
player_appearance <- bind_rows(home_players, away_players) %>%
  filter(!is.na(player_api_id)) %>%
  group_by(player_api_id) %>%
  summarise(appearance = sum(appearance))

In [9]:
# Attach the appearance totals to the player data; copy = TRUE lets dplyr
# copy player_appearance into the same source as all_player_tbl for the join.
all_player_tbl <- all_player_tbl %>%
  inner_join(player_appearance, by = "player_api_id", copy = TRUE)

## Calculating the Number of leagues a Player has played in

In [10]:
# One (player slot, league_id) extract per line-up column, grouped by the
# slot column. Only the slot column and league_id are selected before
# collect(): the following cell keeps nothing else, and collecting the full
# match table for each of the 22 slots would pull every column of every match
# into memory 22 times.
home_league_1 <- match_tbl %>% select(home_player_1, league_id) %>% group_by(home_player_1) %>% collect()
home_league_2 <- match_tbl %>% select(home_player_2, league_id) %>% group_by(home_player_2) %>% collect()
home_league_3 <- match_tbl %>% select(home_player_3, league_id) %>% group_by(home_player_3) %>% collect()
home_league_4 <- match_tbl %>% select(home_player_4, league_id) %>% group_by(home_player_4) %>% collect()
home_league_5 <- match_tbl %>% select(home_player_5, league_id) %>% group_by(home_player_5) %>% collect()
home_league_6 <- match_tbl %>% select(home_player_6, league_id) %>% group_by(home_player_6) %>% collect()
home_league_7 <- match_tbl %>% select(home_player_7, league_id) %>% group_by(home_player_7) %>% collect()
home_league_8 <- match_tbl %>% select(home_player_8, league_id) %>% group_by(home_player_8) %>% collect()
home_league_9 <- match_tbl %>% select(home_player_9, league_id) %>% group_by(home_player_9) %>% collect()
home_league_10 <- match_tbl %>% select(home_player_10, league_id) %>% group_by(home_player_10) %>% collect()
home_league_11 <- match_tbl %>% select(home_player_11, league_id) %>% group_by(home_player_11) %>% collect()

away_league_1 <- match_tbl %>% select(away_player_1, league_id) %>% group_by(away_player_1) %>% collect()
away_league_2 <- match_tbl %>% select(away_player_2, league_id) %>% group_by(away_player_2) %>% collect()
away_league_3 <- match_tbl %>% select(away_player_3, league_id) %>% group_by(away_player_3) %>% collect()
away_league_4 <- match_tbl %>% select(away_player_4, league_id) %>% group_by(away_player_4) %>% collect()
away_league_5 <- match_tbl %>% select(away_player_5, league_id) %>% group_by(away_player_5) %>% collect()
away_league_6 <- match_tbl %>% select(away_player_6, league_id) %>% group_by(away_player_6) %>% collect()
away_league_7 <- match_tbl %>% select(away_player_7, league_id) %>% group_by(away_player_7) %>% collect()
away_league_8 <- match_tbl %>% select(away_player_8, league_id) %>% group_by(away_player_8) %>% collect()
away_league_9 <- match_tbl %>% select(away_player_9, league_id) %>% group_by(away_player_9) %>% collect()
away_league_10 <- match_tbl %>% select(away_player_10, league_id) %>% group_by(away_player_10) %>% collect()
away_league_11 <- match_tbl %>% select(away_player_11, league_id) %>% group_by(away_player_11) %>% collect()

## Selecting all the required columns
For each of the 22 queries for the leagues, collecting the required columns, such as player_api_id and the league_id to process later

In [11]:
# Reduce every per-slot extract to two columns, renaming the slot column to
# the common name player_api_id so the 22 tables can be stacked afterwards.
home_league_1 <- home_league_1 %>% select(player_api_id = home_player_1, league_id)
home_league_2 <- home_league_2 %>% select(player_api_id = home_player_2, league_id)
home_league_3 <- home_league_3 %>% select(player_api_id = home_player_3, league_id)
home_league_4 <- home_league_4 %>% select(player_api_id = home_player_4, league_id)
home_league_5 <- home_league_5 %>% select(player_api_id = home_player_5, league_id)
home_league_6 <- home_league_6 %>% select(player_api_id = home_player_6, league_id)
home_league_7 <- home_league_7 %>% select(player_api_id = home_player_7, league_id)
home_league_8 <- home_league_8 %>% select(player_api_id = home_player_8, league_id)
home_league_9 <- home_league_9 %>% select(player_api_id = home_player_9, league_id)
home_league_10 <- home_league_10 %>% select(player_api_id = home_player_10, league_id)
home_league_11 <- home_league_11 %>% select(player_api_id = home_player_11, league_id)

away_league_1 <- away_league_1 %>% select(player_api_id = away_player_1, league_id)
away_league_2 <- away_league_2 %>% select(player_api_id = away_player_2, league_id)
away_league_3 <- away_league_3 %>% select(player_api_id = away_player_3, league_id)
away_league_4 <- away_league_4 %>% select(player_api_id = away_player_4, league_id)
away_league_5 <- away_league_5 %>% select(player_api_id = away_player_5, league_id)
away_league_6 <- away_league_6 %>% select(player_api_id = away_player_6, league_id)
away_league_7 <- away_league_7 %>% select(player_api_id = away_player_7, league_id)
away_league_8 <- away_league_8 %>% select(player_api_id = away_player_8, league_id)
away_league_9 <- away_league_9 %>% select(player_api_id = away_player_9, league_id)
away_league_10 <- away_league_10 %>% select(player_api_id = away_player_10, league_id)
away_league_11 <- away_league_11 %>% select(player_api_id = away_player_11, league_id)

## Combining all the Leagues for every player
Combining the leagues for every player and then counting the distinct leagues that the player has played in. After that, it is again combined with the all_player_tbl which stores all the required player data.

In [12]:
# Stack the 22 (player_api_id, league_id) tables. bind_rows() is used instead
# of rbind() because the inputs are dplyr tibbles that may still be grouped.
home_leagues <- bind_rows(
  home_league_1, home_league_2, home_league_3, home_league_4, home_league_5,
  home_league_6, home_league_7, home_league_8, home_league_9, home_league_10,
  home_league_11
)
away_leagues <- bind_rows(
  away_league_1, away_league_2, away_league_3, away_league_4, away_league_5,
  away_league_6, away_league_7, away_league_8, away_league_9, away_league_10,
  away_league_11
)
player_leagues <- bind_rows(home_leagues, away_leagues)

# Number of distinct leagues per player. The grouping is stated explicitly
# here; the previous distinct() + count() form only produced per-player
# counts if the grouping of the inputs happened to survive rbind().
# Rows with a missing player id are dropped; they cannot match a player.
player_leagues_counted <- player_leagues %>%
  filter(!is.na(player_api_id)) %>%
  group_by(player_api_id) %>%
  summarise(leagues = n_distinct(league_id))

# Attach the league count to the player data; copy = TRUE lets dplyr copy
# player_leagues_counted into the same source as all_player_tbl for the join.
all_player_tbl <- all_player_tbl %>%
  inner_join(player_leagues_counted, by = "player_api_id", copy = TRUE)


## Writing Data to CSV File
Writing the all_player_tbl as a data frame into a CSV file.

In [13]:
# Materialise the player query as a local data frame and save it as
# player.csv without a row-name column.
all_player_df <- all_player_tbl %>% collect()
outputFilename <- "player.csv"
write.csv(x = all_player_df, file = outputFilename, row.names = FALSE)

## Working with the Team Table
Working with the team and team attributes table to extract the required data
* Selecting latest team rating and combining it with the team table

In [14]:
# Latest attribute snapshot per team: take the maximum date per team, attach
# it to the team table, then join back to the attributes on
# (team_api_id, date) to keep only the row carrying that date.
team_date <- team_attr_tbl %>%
  group_by(team_api_id) %>%
  summarise(date = max(date))
team_with_date <- team_tbl %>%
  inner_join(team_date, by = "team_api_id")
all_team_data <- team_with_date %>%
  inner_join(team_attr_tbl, by = c("team_api_id", "date"), copy = FALSE)

## Calculating Team Parameters
Calculating the team build up, chance creation and the defense scores for each and every team and after that selecting only the required columns from the entire table.

In [15]:
# Composite team scores, each the plain sum of three attribute columns, then
# reduced to the columns needed for the output.
# Every score is computed exactly once. The earlier version computed
# chanceCreation twice and also created an unused `defence` column next to
# the `defense` column that is actually selected.
# NOTE(review): a sum is missing whenever one of its components is missing
# in the database - confirm that is acceptable for buildUp in particular.
all_team_data <- all_team_data %>%
  mutate(
    buildUp = buildUpPlaySpeed + buildUpPlayDribbling + buildUpPlayPassing,
    chanceCreation = chanceCreationPassing + chanceCreationCrossing + chanceCreationShooting,
    defense = defencePressure + defenceAggression + defenceTeamWidth
  ) %>%
  select(team_api_id, team_long_name, buildUp, chanceCreation, defense)

## Calculating Goals for every Team
Calculating the number of goals that each team has scored and then combining it with all_team_data.

In [16]:
# Per-match goals, once from the home side and once from the away side, with
# the team id column renamed to team_api_id, grouped by team and collected
# into local data frames.
match_home <- match_tbl %>%
  select(team_api_id = home_team_api_id, home_team_goal) %>%
  group_by(team_api_id) %>%
  collect()
match_away <- match_tbl %>%
  select(team_api_id = away_team_api_id, away_team_goal) %>%
  group_by(team_api_id) %>%
  collect()

In [17]:
# Total goals per team as home side and as away side: one row per team.
# summarise() replaces the earlier mutate(sum) + within(rm()) + unique()
# sequence, which repeated the team total on every match row and then relied
# on base helpers to drop the column and de-duplicate a grouped tibble.
home_goal_sum <- match_home %>%
  group_by(team_api_id) %>%
  summarise(goals = sum(home_team_goal))
away_goal_sum <- match_away %>%
  group_by(team_api_id) %>%
  summarise(goals = sum(away_team_goal))

# Home and away totals added together: one row per team.
match_goal <- bind_rows(home_goal_sum, away_goal_sum) %>%
  group_by(team_api_id) %>%
  summarise(goals = sum(goals))

In [18]:
# Attach the goal totals to the team data and pull the result into a local
# data frame; copy = TRUE lets dplyr copy match_goal into the same source as
# all_team_data for the join.
all_team_data <- all_team_data %>%
  inner_join(match_goal, by = "team_api_id", copy = TRUE) %>%
  collect()

## Writing Data to CSV File
Writing all_team_data as a data frame into a CSV file.

In [19]:
# Save the team data as team.csv without a row-name column.
all_team_df <- all_team_data %>% collect()
outputFilename <- "team.csv"
write.csv(x = all_team_df, file = outputFilename, row.names = FALSE)