In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load Databases
fd = 'D:\Descargas\Proyecto Final\Resources'
appearances = f'{fd}/appearances.csv'
clubs = f'{fd}/clubs.csv'
competitions = f'{fd}/competitions.csv' # Exclude this for our database
games = f'{fd}/games.csv'
players = f'{fd}/players.csv'
top250 = f'{fd}/top250.csv'

In [3]:
# Creating pandas dataframes
appearances_df = pd.read_csv(appearances)
clubs_df = pd.read_csv(clubs)
games_df = pd.read_csv(games)
players_df = pd.read_csv(players)
top250_df = pd.read_csv(top250)

## Dropping columns from original Datasets

In [4]:
# Mantaining columns in appereances df
appearances_df = appearances_df[['player_id', 'appearance_id', 'game_id', 'goals', 'assists', \
'minutes_played', 'yellow_cards', 'red_cards']]
appearances_df.head()

Unnamed: 0,player_id,appearance_id,game_id,goals,assists,minutes_played,yellow_cards,red_cards
0,410184,2602706_410184,2602706,0,0,45,0,0
1,84938,2581152_84938,2581152,0,0,90,0,0
2,84938,2581178_84938,2581178,0,0,90,0,0
3,84938,2581188_84938,2581188,0,0,90,0,0
4,84938,2581196_84938,2581196,0,0,90,0,0


In [5]:
# Mantaining columns ins games df
games_df = games_df[['game_id', 'season']]
games_df.head()

Unnamed: 0,game_id,season
0,2581147,2015
1,2576512,2015
2,2581181,2015
3,2581179,2015
4,2581178,2015


In [6]:
# Mantaining columns in clubs df
clubs_df = clubs_df[['club_id', 'pretty_name', 'total_market_value']]
clubs_df.rename(columns={"pretty_name": "club_name"}, inplace=True)
clubs_df.head()

Unnamed: 0,club_id,club_name,total_market_value
0,6996,Goverla Uzhgorod,
1,6994,Metalurg Zaporizhya Bis 2016,
2,7185,Panthrakikos Komotini,23.0
3,3216,Mersin Idmanyurdu,
4,28956,Ael Kalloni,


In [7]:
# Mantaining columns in players df
players_df = players_df[['player_id', 'pretty_name', 'current_club_id', 'country_of_birth', \
                         'country_of_citizenship', 'position']]
players_df.rename(columns={"pretty_name": "name",
                          "current_club_id": "club_id"}, inplace=True)
players_df.head()

Unnamed: 0,player_id,name,club_id,country_of_birth,country_of_citizenship,position
0,257767,Aloy Ihenacho,4795,Nigeria,Nigeria,Attack
1,320889,Thomas Blomeyer,4795,Germany,Germany,Defender
2,42318,Konstantin Engel,4795,UdSSR,Kazakhstan,Defender
3,94598,Stefan Wannenwetsch,4795,Germany,Germany,Midfield
4,381921,Ivan Komlev,6994,,Ukraine,Defender


## Dropping columns from Top250 dataset

In [8]:
top250_df = top250_df[['Name', 'Transfer_fee', 'Age', 'Team_from', 'League_from', 'Team_to', \
                      'Market_value', 'Season']]
top250_df.rename(columns={"Name": "name",
                         "Transfer_fee": "transfer_fee",
                         "Age": "age",
                         "Team_from": "team_from",
                         "League_from": "league_from",
                         "Team_to": "team_to",
                         "Market_value": "market_value",
                         "Season": "season"}, inplace=True)
top250_df.head()

Unnamed: 0,name,transfer_fee,age,team_from,league_from,team_to,market_value,season
0,Luís Figo,60000000,27,FC Barcelona,LaLiga,Real Madrid,,2000-2001
1,Hernán Crespo,56810000,25,Parma,Serie A,Lazio,,2000-2001
2,Marc Overmars,40000000,27,Arsenal,Premier League,FC Barcelona,,2000-2001
3,Gabriel Batistuta,36150000,31,Fiorentina,Serie A,AS Roma,,2000-2001
4,Nicolas Anelka,34500000,21,Real Madrid,LaLiga,Paris SG,,2000-2001


## Dropping NaN values

In [9]:
# Dropping NaN values
appearances_df = appearances_df.dropna()
clubs_df = clubs_df.dropna()
games_df = games_df.dropna()
players_df = players_df.dropna()
top250_df = top250_df.dropna()

## Merging Original Datasets
#### Merge game_df and appearances_df
#### Filter by Season (2018)

In [10]:
appear_and_games = pd.merge(games_df, appearances_df, how="inner", on=["game_id", "game_id"])
appear_and_games = appear_and_games.loc[(appear_and_games["season"] == 2018) | (appear_and_games["season"] == 2017)]
appear_and_games.drop(columns=['season'], inplace=True)
appear_and_games.head()

Unnamed: 0,game_id,player_id,appearance_id,goals,assists,minutes_played,yellow_cards,red_cards
263138,2871495,118847,2871495_118847,0,0,90,1,0
263139,2871495,166601,2871495_166601,1,0,90,0,0
263140,2871495,190393,2871495_190393,1,0,90,0,0
263141,2871495,57051,2871495_57051,0,1,90,0,0
263142,2871495,58358,2871495_58358,0,0,61,0,0


#### Merge resulting dataset and players_df

In [11]:
players_in_2018 = pd.merge(players_df, appear_and_games, how="inner", on=["player_id", "player_id"])
players_in_2018.head()

Unnamed: 0,player_id,name,club_id,country_of_birth,country_of_citizenship,position,game_id,appearance_id,goals,assists,minutes_played,yellow_cards,red_cards
0,85763,Ricardo Dias,3349,Portugal,Portugal,Midfield,3076804,3076804_85763,0,0,90,0,0
1,278459,Simon Lorenz,533,Germany,Germany,Defender,2919913,2919913_278459,0,0,90,0,0
2,296188,Alexander Rossipal,533,Germany,Germany,Defender,2919913,2919913_296188,0,0,90,0,0
3,284047,Meris Skenderovic,533,Germany,Montenegro,Attack,2919913,2919913_284047,0,0,34,0,0
4,19548,Eugen Polanski,533,Poland,Poland,Midfield,2872098,2872098_19548,0,0,10,0,0


#### Merging players_2018 with clubs_df

In [12]:
players_in_2018 = pd.merge(players_in_2018, clubs_df, how="inner", on=["club_id", "club_id"])
players_in_2018.head()

Unnamed: 0,player_id,name,club_id,country_of_birth,country_of_citizenship,position,game_id,appearance_id,goals,assists,minutes_played,yellow_cards,red_cards,club_name,total_market_value
0,85763,Ricardo Dias,3349,Portugal,Portugal,Midfield,3076804,3076804_85763,0,0,90,0,0,Cd Feirense,5.33
1,24956,Barge,3349,Portugal,Portugal,Defender,2883640,2883640_24956,0,0,39,0,0,Cd Feirense,5.33
2,24956,Barge,3349,Portugal,Portugal,Defender,2883619,2883619_24956,0,0,90,1,0,Cd Feirense,5.33
3,24956,Barge,3349,Portugal,Portugal,Defender,2883606,2883606_24956,0,0,90,1,0,Cd Feirense,5.33
4,257707,Joao Graca,3349,Portugal,Portugal,Midfield,2884478,2884478_257707,0,0,61,0,0,Cd Feirense,5.33


## Top250 Dataset Preprocessing
#### Filter by Seasons "2017-2018" and "2018-2019"

In [13]:
top250_season_2018 = top250_df[(top250_df['season'] == "2017-2018") | (top250_df['season'] == "2018-2019")]
top250_season_2018.drop(columns=['season'], inplace=True)
top250_season_2018.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,name,transfer_fee,age,team_from,league_from,team_to,market_value
4211,Neymar,222000000,25,FC Barcelona,LaLiga,Paris SG,100000000.0
4212,Philippe Coutinho,125000000,25,Liverpool,Premier League,FC Barcelona,90000000.0
4213,Ousmane Dembélé,115000000,20,Bor. Dortmund,1.Bundesliga,FC Barcelona,33000000.0
4214,Romelu Lukaku,84700000,24,Everton,Premier League,Man Utd,50000000.0
4215,Virgil van Dijk,78800000,26,Southampton,Premier League,Liverpool,30000000.0


## Merging Original Datasets and Top250 Datasets

In [14]:
mix_df =  pd.merge(top250_season_2018, players_in_2018, how="inner", on=["name", "name"])
mix_df.drop(columns=["club_name"], inplace=True)
mix_df.rename(columns={'total_market_value': 'club_market_value'}, inplace=True)
mix_df.head()

Unnamed: 0,name,transfer_fee,age,team_from,league_from,team_to,market_value,player_id,club_id,country_of_birth,country_of_citizenship,position,game_id,appearance_id,goals,assists,minutes_played,yellow_cards,red_cards,club_market_value
0,Neymar,222000000,25,FC Barcelona,LaLiga,Paris SG,100000000.0,68290,583,Brazil,Brazil,Attack,2942689,2942689_68290,1,0,90,1,0,891.18
1,Neymar,222000000,25,FC Barcelona,LaLiga,Paris SG,100000000.0,68290,583,Brazil,Brazil,Attack,2942778,2942778_68290,2,0,90,0,0,891.18
2,Neymar,222000000,25,FC Barcelona,LaLiga,Paris SG,100000000.0,68290,583,Brazil,Brazil,Attack,2942817,2942817_68290,0,1,90,0,0,891.18
3,Neymar,222000000,25,FC Barcelona,LaLiga,Paris SG,100000000.0,68290,583,Brazil,Brazil,Attack,2942806,2942806_68290,1,1,90,0,0,891.18
4,Neymar,222000000,25,FC Barcelona,LaLiga,Paris SG,100000000.0,68290,583,Brazil,Brazil,Attack,2942795,2942795_68290,0,0,90,0,0,891.18


## Obtaining performance dataset
#### Obtain the sum of goals, assists, minutes_played, yellow_cards, red_cards
#### Obtain number of appearances and games played
#### Merge both datasets and obtain the performance datasets

In [15]:
score_df = mix_df.groupby(['name']).sum()[['goals', 'assists', 'minutes_played', 'yellow_cards', \
                                        'red_cards']]
score_df

Unnamed: 0_level_0,goals,assists,minutes_played,yellow_cards,red_cards
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aaron Mooy,7,4,5435,8,0
Abdou Diallo,4,2,5943,7,1
Adam Masina,0,3,4070,14,1
Adam Ounas,5,1,1120,2,0
Adama Diakhaby,4,6,3262,10,2
...,...,...,...,...,...
Yuning Zhang,0,0,117,0,0
Yuri Berchiche,8,8,10388,24,0
Yuya Osako,11,6,3780,6,0
Yves Bissouma,2,1,3414,9,0


In [16]:
plays_df = mix_df.groupby(['name']).count()[['game_id', 'appearance_id']]
plays_df 

Unnamed: 0_level_0,game_id,appearance_id
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aaron Mooy,67,67
Abdou Diallo,68,68
Adam Masina,51,51
Adam Ounas,39,39
Adama Diakhaby,78,78
...,...,...
Yuning Zhang,6,6
Yuri Berchiche,122,122
Yuya Osako,55,55
Yves Bissouma,53,53


In [17]:
performance_df = pd.merge(score_df, plays_df, left_index=True, right_index=True, how="left")
performance_col = performance_df.columns.tolist()
performance_df.rename(columns={"game_id": "games",
                              "appearance_id": "appearances"}, inplace=True)
performance_df

Unnamed: 0_level_0,goals,assists,minutes_played,yellow_cards,red_cards,games,appearances
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aaron Mooy,7,4,5435,8,0,67,67
Abdou Diallo,4,2,5943,7,1,68,68
Adam Masina,0,3,4070,14,1,51,51
Adam Ounas,5,1,1120,2,0,39,39
Adama Diakhaby,4,6,3262,10,2,78,78
...,...,...,...,...,...,...,...
Yuning Zhang,0,0,117,0,0,6,6
Yuri Berchiche,8,8,10388,24,0,122,122
Yuya Osako,11,6,3780,6,0,55,55
Yves Bissouma,2,1,3414,9,0,53,53


## Obtaining clean datasets of each unique player
#### Drop performance related columns in mix_df
#### Drop repeated values in the new df
####  Merging performance_df and new

In [18]:
df = mix_df.drop(columns=performance_col)
df.drop_duplicates(inplace=True)
df

Unnamed: 0,name,transfer_fee,age,team_from,league_from,team_to,market_value,player_id,club_id,country_of_birth,country_of_citizenship,position,club_market_value
0,Neymar,222000000,25,FC Barcelona,LaLiga,Paris SG,100000000.0,68290,583,Brazil,Brazil,Attack,891.18
51,Philippe Coutinho,125000000,25,Liverpool,Premier League,FC Barcelona,90000000.0,80444,131,Brazil,Brazil,Midfield,604.80
147,Romelu Lukaku,84700000,24,Everton,Premier League,Man Utd,50000000.0,96341,631,Belgium,Belgium,Attack,801.45
234,Diego Costa,66000000,29,Chelsea,Premier League,Atlético Madrid,50000000.0,44779,13,Brazil,Spain,Attack,671.31
278,Aymeric Laporte,65000000,23,Athletic Bilbao,LaLiga,Man City,25000000.0,176553,281,France,Spain,Defender,969.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18325,Giangiacomo Magnani,5000000,22,Perugia,Serie B,Juventus,600000.0,310458,276,Italy,Italy,Defender,98.42
18347,Jasmin Kurtic,4800000,29,Atalanta,Serie A,SPAL,5000000.0,85825,1091,Jugoslawien (SFR),Slovenia,Midfield,53.10
18413,Silvan Widmer,4500000,25,Udinese Calcio,Serie A,FC Basel,8500000.0,168989,39,Switzerland,Switzerland,Defender,95.58
18439,Yuya Osako,4500000,28,1. FC Köln,2.Bundesliga,Werder Bremen,4500000.0,108650,86,Japan,Japan,Attack,73.24


In [19]:
data = pd.merge(df, performance_df, how="right", on = ["name", "name"])
data

Unnamed: 0,name,transfer_fee,age,team_from,league_from,team_to,market_value,player_id,club_id,country_of_birth,country_of_citizenship,position,club_market_value,goals,assists,minutes_played,yellow_cards,red_cards,games,appearances
0,Aaron Mooy,9100000,26,Man City,Premier League,Huddersfield,5000000.0,123951,1237,Australia,Australia,Midfield,231.30,7,4,5435,8,0,67,67
1,Abdou Diallo,28000000,22,1.FSV Mainz 05,1.Bundesliga,Bor. Dortmund,12000000.0,229005,583,France,Senegal,Defender,891.18,4,2,5943,7,1,68,68
2,Adam Masina,5000000,24,Bologna,Serie A,Watford,7000000.0,286949,1010,Morocco,Morocco,Defender,130.95,0,3,4070,14,1,51,51
3,Adam Ounas,10000000,20,G. Bordeaux,Ligue 1,SSC Napoli,4000000.0,400485,6195,France,Algeria,Attack,466.70,5,1,1120,2,0,39,39
4,Adama Diakhaby,10000000,21,Stade Rennais,Ligue 1,Monaco,1500000.0,453594,1110,France,France,Attack,30.69,4,6,3262,10,2,78,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,Yuri Berchiche,16000000,27,Real Sociedad,LaLiga,Paris SG,10000000.0,61812,621,Spain,Spain,Defender,182.07,8,8,10388,24,0,122,122
361,Yuri Berchiche,24000000,28,Paris SG,Ligue 1,Athletic Bilbao,17000000.0,61812,621,Spain,Spain,Defender,182.07,8,8,10388,24,0,122,122
362,Yuya Osako,4500000,28,1. FC Köln,2.Bundesliga,Werder Bremen,4500000.0,108650,86,Japan,Japan,Attack,73.24,11,6,3780,6,0,55,55
363,Yves Bissouma,16900000,21,LOSC Lille,Ligue 1,Brighton,8000000.0,410425,1237,Cote d'Ivoire,Mali,Midfield,231.30,2,1,3414,9,0,53,53


## Exporting data to csv

In [20]:
data.to_csv('data_features.csv', index=False)