### Connecting to the European Soccer Database

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import datetime

try:
    sqliteConnection = sqlite3.connect('database.sqlite')
    #cursor = sqliteConnection.cursor()
    print("Database created and Successfully Connected to SQLite")
except sqlite3.Error as error:
    print("Error while connecting to sqlite", error)

Database created and Successfully Connected to SQLite


### Populating the 7 tables in the database to 7 Pandas Dataframes

In [62]:
country_query = "SELECT * from Country"
country_df = pd.read_sql_query(country_query, sqliteConnection)

league_query = "SELECT * from League"
league_df = pd.read_sql_query(league_query, sqliteConnection)

match_query = "SELECT * from Match"
match_df = pd.read_sql_query(match_query, sqliteConnection)

player_query = "SELECT * from Player"
player_df = pd.read_sql_query(player_query, sqliteConnection)

playerAttributes_query = "SELECT * from Player_Attributes"
playerAttributes_df = pd.read_sql_query(playerAttributes_query, sqliteConnection)

team_query = "SELECT * from Team"
team_df = pd.read_sql_query(team_query, sqliteConnection)

teamAttributes_query = "SELECT * from Team_Attributes"
teamAttributes_df = pd.read_sql_query(teamAttributes_query, sqliteConnection)

### Cleaning the Country Dataframe

In [3]:
del country_df['id']
country_df.insert(0, 'Country_id', range(1, len(country_df)+1))
country_df = country_df.rename(columns={"name": "Country_Name"})
country_df

Unnamed: 0,Country_id,Country_Name
0,1,Belgium
1,2,England
2,3,France
3,4,Germany
4,5,Italy
5,6,Netherlands
6,7,Poland
7,8,Portugal
8,9,Scotland
9,10,Spain


### Cleaning the League Dataframe

In [4]:
del league_df['id']
del league_df['country_id']
league_df.insert(0, 'League_id', range(1, len(league_df)+1))
league_df = league_df.rename(columns={"name": "League_Name"})
league_df

Unnamed: 0,League_id,League_Name
0,1,Belgium Jupiler League
1,2,England Premier League
2,3,France Ligue 1
3,4,Germany 1. Bundesliga
4,5,Italy Serie A
5,6,Netherlands Eredivisie
6,7,Poland Ekstraklasa
7,8,Portugal Liga ZON Sagres
8,9,Scotland Premier League
9,10,Spain LIGA BBVA


### Cleaning the Match Dataframe

In [63]:
match_df = match_df.rename(columns={"id": "Match_id"})

#Replacing the old country_id values with the Country_Name values for better readability
match_df['country_id'] = match_df['country_id'].replace(1,'Belgium')
match_df['country_id'] = match_df['country_id'].replace(1729,'England')
match_df['country_id'] = match_df['country_id'].replace(4769,'France')
match_df['country_id'] = match_df['country_id'].replace(7809,'Germany')
match_df['country_id'] = match_df['country_id'].replace(10257,'Italy')
match_df['country_id'] = match_df['country_id'].replace(13274,'Netherlands')
match_df['country_id'] = match_df['country_id'].replace(15722,'Poland')
match_df['country_id'] = match_df['country_id'].replace(17642,'Portugal')
match_df['country_id'] = match_df['country_id'].replace(19694,'Scotland')
match_df['country_id'] = match_df['country_id'].replace(21518,'Spain')
match_df['country_id'] = match_df['country_id'].replace(24558,'Switzerland')
match_df = match_df.rename(columns={"country_id": "Country_Name"})

#Replacing the old league_id values with the League_Name values for better readability
match_df['league_id'] = match_df['league_id'].replace(1,'Belgium Jupiler League')
match_df['league_id'] = match_df['league_id'].replace(1729,'England Premier League')
match_df['league_id'] = match_df['league_id'].replace(4769,'France Ligue 1')
match_df['league_id'] = match_df['league_id'].replace(7809,'Germany 1. Bundesliga')
match_df['league_id'] = match_df['league_id'].replace(10257,'Italy Serie A')
match_df['league_id'] = match_df['league_id'].replace(13274,'Netherlands Eredivisie')
match_df['league_id'] = match_df['league_id'].replace(15722,'Poland Ekstraklasa')
match_df['league_id'] = match_df['league_id'].replace(17642,'Portugal Liga ZON Sagres')
match_df['league_id'] = match_df['league_id'].replace(19694,'Scotland Premier League')
match_df['league_id'] = match_df['league_id'].replace(21518,'Spain LIGA BBVA')
match_df['league_id'] = match_df['league_id'].replace(24558,'Switzerland Super League')
match_df = match_df.rename(columns={"league_id": "League_Name"})
match_df['date'] = pd.to_datetime(match_df['date'], format='%Y/%m/%d %H:%M:%S')
match_df.head()

Unnamed: 0,Match_id,Country_Name,League_Name,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,Belgium,Belgium Jupiler League,2008/2009,1,2008-08-17,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.73,3.4,5.0,1.75,3.35,4.2,1.85,3.2,3.5,1.8,3.3,3.75,,,,1.7,3.3,4.33,1.9,3.3,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,Belgium,Belgium Jupiler League,2008/2009,1,2008-08-16,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.95,3.2,3.6,1.8,3.3,3.95,1.9,3.2,3.5,1.9,3.2,3.5,,,,1.83,3.3,3.6,1.95,3.3,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,Belgium,Belgium Jupiler League,2008/2009,1,2008-08-16,492475,9984,8635,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.38,3.3,2.75,2.4,3.3,2.55,2.6,3.1,2.3,2.5,3.2,2.5,,,,2.5,3.25,2.4,2.63,3.3,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,Belgium,Belgium Jupiler League,2008/2009,1,2008-08-17,492476,9991,9998,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.44,3.75,7.5,1.4,4.0,6.8,1.4,3.9,6.0,1.44,3.6,6.5,,,,1.44,3.75,6.0,1.44,4.0,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,Belgium,Belgium Jupiler League,2008/2009,1,2008-08-16,492477,7947,9985,1,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,3.5,1.65,5.0,3.5,1.6,4.0,3.3,1.7,4.0,3.4,1.72,,,,4.2,3.4,1.7,4.5,3.5,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


### Cleaning the Player Dataframe

In [6]:
missing_values_count_player = player_df.isnull().sum()
print(missing_values_count_player)
duplicateRowsPlayerDF = player_df[player_df.duplicated(['player_api_id', 'player_fifa_api_id'])]
print('Number of duplicates: ' + str(duplicateRowsPlayerDF.shape[0]))
del player_df['id']
player_df.insert(0, 'id', range(1, len(player_df)+1))
player_df

id                    0
player_api_id         0
player_name           0
player_fifa_api_id    0
birthday              0
height                0
weight                0
dtype: int64
Number of duplicates: 0


Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154
5,6,27316,Aaron Hunt,158138,1986-09-04 00:00:00,182.88,161
6,7,564793,Aaron Kuhl,221280,1996-01-30 00:00:00,172.72,146
7,8,30895,Aaron Lennon,152747,1987-04-16 00:00:00,165.10,139
8,9,528212,Aaron Lennox,206592,1993-02-19 00:00:00,190.50,181
9,10,101042,Aaron Meijers,188621,1987-10-28 00:00:00,175.26,170


### Cleaning the Player Attributes Dataframe

In [7]:
playerAttributes_df.head(10)

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
5,6,189615,155782,2016-04-21 00:00:00,74.0,76.0,left,high,medium,80.0,...,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
6,7,189615,155782,2016-04-07 00:00:00,74.0,76.0,left,high,medium,80.0,...,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
7,8,189615,155782,2016-01-07 00:00:00,73.0,75.0,left,high,medium,79.0,...,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
8,9,189615,155782,2015-12-24 00:00:00,73.0,75.0,left,high,medium,79.0,...,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
9,10,189615,155782,2015-12-17 00:00:00,73.0,75.0,left,high,medium,79.0,...,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0


In [8]:
print('The number of rows before removing duplicates: ' + str(playerAttributes_df.shape[0]))
duplicateRowsPlayerAttributesDF = playerAttributes_df[playerAttributes_df.duplicated(['player_fifa_api_id', 'player_api_id'])]
duplicatePercentage = (duplicateRowsPlayerAttributesDF.shape[0]/playerAttributes_df.shape[0]) * 100
print('The percentage of duplicated rows: ' + str(duplicatePercentage))

The number of rows before removing duplicates: 183978
The percentage of duplicated rows: 93.98351976866799


In [9]:
playerAttributes_df_1 = playerAttributes_df.sort_values('date').drop_duplicates(['player_fifa_api_id', 'player_api_id'],keep='last')
playerAttributes_df_1 = playerAttributes_df_1.sort_values('id')
print('The number of rows after removing duplicates: ' + str(playerAttributes_df_1.shape[0]))
percentageLeft = (playerAttributes_df_1.shape[0]/playerAttributes_df.shape[0]) * 100
print('The percentage of rows left: ' + str(percentageLeft))
playerAttributes_df_1.head(10)

The number of rows after removing duplicates: 11069
The percentage of rows left: 6.0164802313320065


Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
5,6,189615,155782,2016-04-21 00:00:00,74.0,76.0,left,high,medium,80.0,...,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
38,39,186170,162549,2016-01-07 00:00:00,65.0,67.0,right,medium,medium,64.0,...,68.0,61.0,23.0,22.0,24.0,16.0,11.0,12.0,9.0,13.0
64,65,140161,30572,2016-04-21 00:00:00,69.0,69.0,right,medium,medium,57.0,...,54.0,37.0,72.0,71.0,68.0,15.0,12.0,13.0,12.0,11.0
87,88,17725,23780,2015-12-24 00:00:00,70.0,70.0,right,medium,medium,46.0,...,41.0,45.0,75.0,73.0,71.0,8.0,6.0,16.0,12.0,11.0
112,113,158138,27316,2016-04-28 00:00:00,77.0,77.0,left,medium,medium,77.0,...,76.0,81.0,28.0,34.0,35.0,15.0,12.0,7.0,16.0,15.0
139,140,221280,564793,2016-04-21 00:00:00,61.0,74.0,right,medium,high,48.0,...,61.0,42.0,52.0,58.0,57.0,8.0,13.0,14.0,14.0,15.0
146,147,152747,30895,2015-10-16 00:00:00,77.0,77.0,right,high,medium,78.0,...,73.0,62.0,30.0,34.0,35.0,14.0,7.0,7.0,16.0,11.0
172,173,206592,528212,2016-02-25 00:00:00,48.0,56.0,right,medium,medium,12.0,...,15.0,41.0,15.0,15.0,12.0,53.0,41.0,39.0,51.0,53.0
179,180,188621,101042,2015-12-03 00:00:00,69.0,69.0,left,medium,medium,63.0,...,71.0,56.0,67.0,68.0,65.0,7.0,15.0,7.0,10.0,15.0


In [10]:
# get the number of missing values per column
missing_values_count_playerAttributes = playerAttributes_df_1.isnull().sum()

print(missing_values_count_playerAttributes)

total_cells_playerAttributes = np.product(playerAttributes_df_1.shape) 
total_missing_playerAttributes = missing_values_count_playerAttributes.sum()

# percentage of data that is missing
percentage_missign_values_playerAttributes = (total_missing_playerAttributes/total_cells_playerAttributes) * 100
print('Percentage of missing values: '+ str(percentage_missign_values_playerAttributes))

id                       0
player_fifa_api_id       0
player_api_id            0
date                     0
overall_rating           4
potential                4
preferred_foot           4
attacking_work_rate    544
defensive_work_rate      4
crossing                 4
finishing                4
heading_accuracy         4
short_passing            4
volleys                482
dribbling                4
curve                  482
free_kick_accuracy       4
long_passing             4
ball_control             4
acceleration             4
sprint_speed             4
agility                482
reactions                4
balance                482
shot_power               4
jumping                482
stamina                  4
strength                 4
long_shots               4
aggression               4
interceptions            4
positioning              4
vision                 482
penalties                4
marking                  4
standing_tackle          4
sliding_tackle         482
g

In [11]:
rows_with_missing_values = playerAttributes_df_1[playerAttributes_df_1.isna().any(axis=1)]
print('Percentage of rows with missing values: '+ str((rows_with_missing_values.shape[0]/playerAttributes_df_1.shape[0])*100))

Percentage of rows with missing values: 5.908392808745145


In [12]:
# merge with player_df to search on the internet for missing values with player names
df2 = pd.merge(player_df,playerAttributes_df_1,on=['player_api_id','player_fifa_api_id'])
# sort and get the percentage of old players having NaNs
rows_with_null_sorted = df2[df2.isna().any(axis=1)].sort_values('birthday')
end_date = pd.Timestamp(datetime.date(1980,1,1))
rows_with_null_sorted['birthday'] = pd.to_datetime(rows_with_null_sorted['birthday'])
mask = rows_with_null_sorted['birthday'] <= end_date
old_players = rows_with_null_sorted.loc[mask]
old_players

Unnamed: 0,id_x,player_api_id,player_name,player_fifa_api_id,birthday,height,weight,id_y,date,overall_rating,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
289,290,39425,Alberto Fontana,2431,1967-01-23,185.42,161,4840,2009-02-22 00:00:00,77.0,...,,58.0,24.0,27.0,,76.0,79.0,75.0,77.0,77.0
8284,8285,26099,Paolo Maldini,1109,1968-06-26,185.42,187,138695,2009-02-22 00:00:00,77.0,...,,90.0,85.0,82.0,,5.0,21.0,70.0,21.0,21.0
6238,6239,27666,Luca Bucci,12514,1969-03-13,180.34,174,103754,2009-02-22 00:00:00,63.0,...,,33.0,47.0,39.0,,58.0,66.0,69.0,67.0,66.0
2492,2493,23605,Dean Windass,50474,1969-04-01,177.80,183,41818,2010-02-22 00:00:00,66.0,...,,70.0,44.0,38.0,,8.0,20.0,61.0,20.0,20.0
7389,7390,27346,Michael Tarnat,722,1969-10-27,185.42,192,123819,2008-08-30 00:00:00,73.0,...,,87.0,78.0,77.0,,28.0,36.0,80.0,31.0,32.0
4827,4828,30648,Jens Lehmann,805,1969-11-10,190.50,192,80260,2010-02-22 00:00:00,79.0,...,,74.0,21.0,21.0,,77.0,82.0,75.0,88.0,76.0
4122,4123,26003,Hans Vonk,7947,1970-01-30,195.58,183,68729,2009-02-22 00:00:00,69.0,...,,31.0,31.0,22.0,,68.0,69.0,64.0,73.0,70.0
887,888,41881,Antonio Chimenti,4739,1970-06-30,182.88,183,15415,2010-02-22 00:00:00,70.0,...,,50.0,28.0,28.0,,68.0,69.0,70.0,75.0,70.0
3116,3117,27661,Eugenio Corini,5244,1970-07-30,172.72,148,52029,2009-02-22 00:00:00,72.0,...,,70.0,44.0,69.0,,6.0,22.0,73.0,22.0,22.0
8512,8513,11716,Pedro Roma,20490,1970-08-13,185.42,183,142017,2009-02-22 00:00:00,70.0,...,,42.0,23.0,36.0,,70.0,69.0,66.0,66.0,71.0


In [13]:
rows_with_missing_values_old_players = old_players[old_players.isna().any(axis=1)]
print('Percentage of rows of old players with missing values: '+ str((rows_with_missing_values_old_players.shape[0]/playerAttributes_df_1.shape[0])*100))

Percentage of rows of old players with missing values: 2.7915800885355497


In [14]:
playerAttributes_df_1 = playerAttributes_df_1.dropna()
playerAttributes_df_1

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
5,6,189615,155782,2016-04-21 00:00:00,74.0,76.0,left,high,medium,80.0,...,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
38,39,186170,162549,2016-01-07 00:00:00,65.0,67.0,right,medium,medium,64.0,...,68.0,61.0,23.0,22.0,24.0,16.0,11.0,12.0,9.0,13.0
64,65,140161,30572,2016-04-21 00:00:00,69.0,69.0,right,medium,medium,57.0,...,54.0,37.0,72.0,71.0,68.0,15.0,12.0,13.0,12.0,11.0
87,88,17725,23780,2015-12-24 00:00:00,70.0,70.0,right,medium,medium,46.0,...,41.0,45.0,75.0,73.0,71.0,8.0,6.0,16.0,12.0,11.0
112,113,158138,27316,2016-04-28 00:00:00,77.0,77.0,left,medium,medium,77.0,...,76.0,81.0,28.0,34.0,35.0,15.0,12.0,7.0,16.0,15.0
139,140,221280,564793,2016-04-21 00:00:00,61.0,74.0,right,medium,high,48.0,...,61.0,42.0,52.0,58.0,57.0,8.0,13.0,14.0,14.0,15.0
146,147,152747,30895,2015-10-16 00:00:00,77.0,77.0,right,high,medium,78.0,...,73.0,62.0,30.0,34.0,35.0,14.0,7.0,7.0,16.0,11.0
172,173,206592,528212,2016-02-25 00:00:00,48.0,56.0,right,medium,medium,12.0,...,15.0,41.0,15.0,15.0,12.0,53.0,41.0,39.0,51.0,53.0
179,180,188621,101042,2015-12-03 00:00:00,69.0,69.0,left,medium,medium,63.0,...,71.0,56.0,67.0,68.0,65.0,7.0,15.0,7.0,10.0,15.0


In [15]:
missing_values_count_playerAttributes = playerAttributes_df_1.isnull().sum()
print(missing_values_count_playerAttributes)

id                     0
player_fifa_api_id     0
player_api_id          0
date                   0
overall_rating         0
potential              0
preferred_foot         0
attacking_work_rate    0
defensive_work_rate    0
crossing               0
finishing              0
heading_accuracy       0
short_passing          0
volleys                0
dribbling              0
curve                  0
free_kick_accuracy     0
long_passing           0
ball_control           0
acceleration           0
sprint_speed           0
agility                0
reactions              0
balance                0
shot_power             0
jumping                0
stamina                0
strength               0
long_shots             0
aggression             0
interceptions          0
positioning            0
vision                 0
penalties              0
marking                0
standing_tackle        0
sliding_tackle         0
gk_diving              0
gk_handling            0
gk_kicking             0


In [16]:
playerAttributes_df_1 = playerAttributes_df_1.rename(columns={"date": "date_modified"})
del playerAttributes_df_1['id']
playerAttributes_df_1.insert(0, 'id', range(1, len(playerAttributes_df_1)+1))
playerAttributes_df_1

Unnamed: 0,id,player_fifa_api_id,player_api_id,date_modified,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
5,2,189615,155782,2016-04-21 00:00:00,74.0,76.0,left,high,medium,80.0,...,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
38,3,186170,162549,2016-01-07 00:00:00,65.0,67.0,right,medium,medium,64.0,...,68.0,61.0,23.0,22.0,24.0,16.0,11.0,12.0,9.0,13.0
64,4,140161,30572,2016-04-21 00:00:00,69.0,69.0,right,medium,medium,57.0,...,54.0,37.0,72.0,71.0,68.0,15.0,12.0,13.0,12.0,11.0
87,5,17725,23780,2015-12-24 00:00:00,70.0,70.0,right,medium,medium,46.0,...,41.0,45.0,75.0,73.0,71.0,8.0,6.0,16.0,12.0,11.0
112,6,158138,27316,2016-04-28 00:00:00,77.0,77.0,left,medium,medium,77.0,...,76.0,81.0,28.0,34.0,35.0,15.0,12.0,7.0,16.0,15.0
139,7,221280,564793,2016-04-21 00:00:00,61.0,74.0,right,medium,high,48.0,...,61.0,42.0,52.0,58.0,57.0,8.0,13.0,14.0,14.0,15.0
146,8,152747,30895,2015-10-16 00:00:00,77.0,77.0,right,high,medium,78.0,...,73.0,62.0,30.0,34.0,35.0,14.0,7.0,7.0,16.0,11.0
172,9,206592,528212,2016-02-25 00:00:00,48.0,56.0,right,medium,medium,12.0,...,15.0,41.0,15.0,15.0,12.0,53.0,41.0,39.0,51.0,53.0
179,10,188621,101042,2015-12-03 00:00:00,69.0,69.0,left,medium,medium,63.0,...,71.0,56.0,67.0,68.0,65.0,7.0,15.0,7.0,10.0,15.0


In [17]:
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 116)

### Cleaning the Home Players X1 to X11 Coordinates (1/2)
Getting the missing X1 to X11 values from the complete records of the home teams in the table

In [64]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['home_player_X'+str(m)])
        home_player_X_rows = match_df[match_df['home_player_X'+str(m)].isnull()]
        if n < 10:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_X_rows = home_player_X_rows.reset_index()

        for i in range(0, home_player_X_rows.shape[0]):
            return_rows = home_players_df[home_players_df['home_team_api_id'] == home_player_X_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['home_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['home_team_api_id'] == home_player_X_rows['home_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['home_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Home Players X1 to X11 Coordinates (2/2)
Getting the missing X1 to X11 values from the complete records of the away teams in the table

In [65]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['away_player_X'+str(m)])
        home_player_X_rows = match_df[match_df['home_player_X'+str(m)].isnull()]
        if n < 10:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_X_rows = home_player_X_rows.reset_index()

        for i in range(0, home_player_X_rows.shape[0]):
            return_rows = home_players_df[home_players_df['away_team_api_id'] == home_player_X_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['away_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['away_team_api_id'] == home_player_X_rows['home_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['home_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Home Players Y1 to Y11 Coordinates (1/2)
Getting the missing Y1 to Y11 values from the complete records of the home teams in the table

In [66]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['home_player_Y'+str(m)])
        home_player_Y_rows = match_df[match_df['home_player_Y'+str(m)].isnull()]
        if n < 10:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_Y_rows = home_player_Y_rows.reset_index()

        for i in range(0, home_player_Y_rows.shape[0]):
            return_rows = home_players_df[home_players_df['home_team_api_id'] == home_player_Y_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['home_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['home_team_api_id'] == home_player_Y_rows['home_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['home_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Home Players Y1 to Y11 Coordinates (2/2)
Getting the missing Y1 to Y11 values from the complete records of the away teams in the table

In [67]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['away_player_Y'+str(m)])
        home_player_Y_rows = match_df[match_df['home_player_Y'+str(m)].isnull()]
        if n < 10:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_Y_rows = home_player_Y_rows.reset_index()

        for i in range(0, home_player_Y_rows.shape[0]):
            return_rows = home_players_df[home_players_df['away_team_api_id'] == home_player_Y_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['away_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['away_team_api_id'] == home_player_Y_rows['home_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['home_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Away Players X1 to X11 Coordinates (1/2)
Getting the missing X1 to X11 values from the complete records of the away teams in the table

In [68]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['away_player_X'+str(m)])

        away_player_X_rows = match_df[match_df['away_player_X'+str(m)].isnull()]
        if n < 10:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_X_rows = away_player_X_rows.reset_index()

        for i in range(0, away_player_X_rows.shape[0]):
            return_rows = away_players_df[away_players_df['away_team_api_id'] == away_player_X_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['away_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['away_team_api_id'] == away_player_X_rows['away_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['away_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Away Players X1 to X11 Coordinates (2/2)
Getting the missing X1 to X11 values from the complete records of the home teams in the table

In [69]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['home_player_X'+str(m)])
        away_player_X_rows = match_df[match_df['away_player_X'+str(m)].isnull()]
        if n < 10:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_X_rows = away_player_X_rows.reset_index()

        for i in range(0, away_player_X_rows.shape[0]):
            return_rows = away_players_df[away_players_df['home_team_api_id'] == away_player_X_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['home_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['home_team_api_id'] == away_player_X_rows['away_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['away_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Away Players Y1 to Y11 Coordinates (1/2)
Getting the missing Y1 to Y11 values from the complete records of the away teams in the table

In [70]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['away_player_Y'+str(m)])

        away_player_Y_rows = match_df[match_df['away_player_Y'+str(m)].isnull()]
        if n < 10:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_Y_rows = away_player_Y_rows.reset_index()

        for i in range(0, away_player_Y_rows.shape[0]):
            return_rows = away_players_df[away_players_df['away_team_api_id'] == away_player_Y_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['away_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['away_team_api_id'] == away_player_Y_rows['away_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['away_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Away Players Y1 to Y11 Coordinates (2/2)
Getting the missing Y1 to Y11 values from the complete records of the home teams in the table

In [71]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['home_player_Y'+str(m)])
        away_player_Y_rows = match_df[match_df['away_player_Y'+str(m)].isnull()]
        if n < 10:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_Y_rows = away_player_Y_rows.reset_index()

        for i in range(0, away_player_Y_rows.shape[0]):
            return_rows = away_players_df[away_players_df['home_team_api_id'] == away_player_Y_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['home_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['home_team_api_id'] == away_player_Y_rows['away_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['away_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Home Players Columns (1/2)
Getting the missing home player 1 to 11 values from the complete records of the home teams in the table

In [72]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['home_player_'+str(m)])

        home_player_rows = match_df[match_df['home_player_'+str(m)].isnull()]
        if n < 10:
            home_player_rows = home_player_rows[home_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_rows = home_player_rows[home_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_rows = home_player_rows.reset_index()

        for i in range(0, home_player_rows.shape[0]):
            return_rows = home_players_df[home_players_df['home_team_api_id'] == home_player_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['home_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['home_team_api_id'] == home_player_rows['home_team_api_id'][i])]
                null_rows = null_rows[null_rows['home_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_'+str(m)] = int(Player[Player.index[0]])

### Cleaning the Home Players Columns (2/2)
Getting the missing home player 1 to 11 values from the complete records of the away teams in the table

In [73]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['away_player_'+str(m)])

        home_player_rows = match_df[match_df['home_player_'+str(m)].isnull()]
        if n < 10:
            home_player_rows = home_player_rows[home_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_rows = home_player_rows[home_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_rows = home_player_rows.reset_index()

        for i in range(0, home_player_rows.shape[0]):
            return_rows = home_players_df[home_players_df['away_team_api_id'] == home_player_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['away_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['away_team_api_id'] == home_player_rows['home_team_api_id'][i])]
                null_rows = null_rows[null_rows['home_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_'+str(m)] = int(Player[Player.index[0]])

### Cleaning the Away Players Columns (1/2)
Getting the missing away player 1 to 11 values from the complete records of the away teams in the table

In [74]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['away_player_'+str(m)])

        away_player_rows = match_df[match_df['away_player_'+str(m)].isnull()]
        if n < 10:
            away_player_rows = away_player_rows[away_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_rows = away_player_rows[away_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_rows = away_player_rows.reset_index()

        for i in range(0, away_player_rows.shape[0]):
            return_rows = away_players_df[away_players_df['away_team_api_id'] == away_player_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['away_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['away_team_api_id'] == away_player_rows['away_team_api_id'][i])]
                null_rows = null_rows[null_rows['away_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_'+str(m)] = int(Player[Player.index[0]])

### Cleaning the Away Players Columns (2/2)
Getting the missing away player 1 to 11 values from the complete records of the home teams in the table

In [75]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['home_player_'+str(m)])

        away_player_rows = match_df[match_df['away_player_'+str(m)].isnull()]
        if n < 10:
            away_player_rows = away_player_rows[away_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_rows = away_player_rows[away_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_rows = away_player_rows.reset_index()

        for i in range(0, away_player_rows.shape[0]):
            return_rows = away_players_df[away_players_df['home_team_api_id'] == away_player_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['home_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['home_team_api_id'] == away_player_rows['away_team_api_id'][i])]
                null_rows = null_rows[null_rows['away_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_'+str(m)] = int(Player[Player.index[0]])

In [76]:
match_df.isnull().sum()

Match_id                0
Country_Name            0
League_Name             0
season                  0
stage                   0
date                    0
match_api_id            0
home_team_api_id        0
away_team_api_id        0
home_team_goal          0
away_team_goal          0
home_player_X1       1200
home_player_X2       1200
home_player_X3       1200
home_player_X4       1200
home_player_X5       1200
home_player_X6       1200
home_player_X7       1200
home_player_X8       1200
home_player_X9       1200
home_player_X10      1200
home_player_X11      1200
away_player_X1       1200
away_player_X2       1200
away_player_X3       1200
away_player_X4       1200
away_player_X5       1200
away_player_X6       1200
away_player_X7       1200
away_player_X8       1200
away_player_X9       1200
away_player_X10      1200
away_player_X11      1200
home_player_Y1       1200
home_player_Y2       1200
home_player_Y3       1200
home_player_Y4       1200
home_player_Y5       1200
home_player_

### Dropping the Null Values From Matche Dataframe

In [88]:
match_df.drop(['goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner', 'possession', 'B365H', 'B365D',
               'B365A', 'BWH', 'BWD', 'BWA','IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA','PSH', 'PSD', 'PSA','WHH', 'WHD', 'WHA',
               'SJH', 'SJD','SJA', 'VCH','VCD','VCA','GBH','GBD','GBA','BSH','BSD','BSA'], axis=1, inplace=True)

KeyError: "['goal' 'shoton' 'shotoff' 'foulcommit' 'card' 'cross' 'corner'\n 'possession' 'B365H' 'B365D' 'B365A' 'BWH' 'BWD' 'BWA' 'IWH' 'IWD' 'IWA'\n 'LBH' 'LBD' 'LBA' 'PSH' 'PSD' 'PSA' 'WHH' 'WHD' 'WHA' 'SJH' 'SJD' 'SJA'\n 'VCH' 'VCD' 'VCA' 'GBH' 'GBD' 'GBA' 'BSH' 'BSD' 'BSA'] not found in axis"

In [84]:
match_df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

In [90]:
match_df.isnull().sum()

Match_id            0
Country_Name        0
League_Name         0
season              0
stage               0
date                0
match_api_id        0
home_team_api_id    0
away_team_api_id    0
home_team_goal      0
away_team_goal      0
home_player_X1      0
home_player_X2      0
home_player_X3      0
home_player_X4      0
home_player_X5      0
home_player_X6      0
home_player_X7      0
home_player_X8      0
home_player_X9      0
home_player_X10     0
home_player_X11     0
away_player_X1      0
away_player_X2      0
away_player_X3      0
away_player_X4      0
away_player_X5      0
away_player_X6      0
away_player_X7      0
away_player_X8      0
away_player_X9      0
away_player_X10     0
away_player_X11     0
home_player_Y1      0
home_player_Y2      0
home_player_Y3      0
home_player_Y4      0
home_player_Y5      0
home_player_Y6      0
home_player_Y7      0
home_player_Y8      0
home_player_Y9      0
home_player_Y10     0
home_player_Y11     0
away_player_Y1      0
away_playe

### Cleaning the Team Dataframe

In [None]:
missing_values_count_team = team_df.isnull().sum()
print(missing_values_count_team)

In [None]:
team_api_id_1=team_df[['team_api_id','team_fifa_api_id']]
team_api_id_2=teamAttributes_df[['team_api_id','team_fifa_api_id']]
team_api_id_2.drop_duplicates(subset ="team_fifa_api_id", keep = "last", inplace = True)
missing_api_1=team_api_id_1[team_api_id_1["team_fifa_api_id"].isnull()]
found_api_2= team_api_id_2.loc[team_api_id_2['team_api_id'].isin(missing_api_1)]
found_api_2


In [None]:
team_df.dropna(inplace=True)
missing_values_count_team = team_df.isnull().sum()
print(missing_values_count_team)


In [None]:
duplicateRowsTeamDF = team_df[team_df.duplicated(['team_long_name'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_long_name: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

In [None]:
duplicate_rows_df= team_df[team_df.duplicated(['team_fifa_api_id', 'team_long_name'], keep=False)]
display(duplicate_rows_df)
target_team_api_id=duplicate_rows_df['team_fifa_api_id'].tolist()
teamAttributes_subset=teamAttributes_df.loc[teamAttributes_df['team_fifa_api_id'].isin(target_team_api_id)]
display(teamAttributes_subset.sort_values(['team_fifa_api_id','team_api_id']))

In [None]:
team_df=team_df.sort_values(['team_fifa_api_id','team_api_id'])
team_df.drop_duplicates(subset =['team_fifa_api_id', 'team_long_name'], keep = 'last', inplace = True)

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_long_name'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_long_name: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

In [None]:
duplicateRowsTeamDF = team_df[team_df.duplicated(['team_long_name'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_long_name: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

print('The number of missing values in each column:')
team_df.dropna(inplace=True)
missing_values_count_team= team_df.isnull().sum()
print(missing_values_count_team)

In [None]:
del team_df['id']
team_df.insert(0, 'team_id', range(1, len(team_df)+1))
team_df

### Cleaning The Team Attributes Dataframe

In [None]:
duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

In [None]:
duplicated_api= teamAttributes_df[teamAttributes_df.duplicated(['team_api_id'], keep=False)]
duplicated_fifa_api= teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'], keep=False)]
differ= duplicated_fifa_api[~duplicated_fifa_api['team_api_id'].isin(duplicated_api['team_api_id'])]
differ

In [None]:
teamAttributes_df=teamAttributes_df.sort_values(['team_fifa_api_id','team_api_id','date'])
teamAttributes_df.drop_duplicates(subset =['team_fifa_api_id', 'team_api_id'], keep = 'last', inplace = True)

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

In [None]:
duplicated_fifa_api= teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'], keep=False)]
duplicated_fifa_api

In [None]:
teamAttributes_df=teamAttributes_df.sort_values(['team_fifa_api_id','team_api_id','date'])
teamAttributes_df.drop_duplicates(subset =['team_fifa_api_id'], keep = 'last', inplace = True)

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

In [None]:
print('The number of missing values in each column:')
missing_values_count_team_attributes = teamAttributes_df.isnull().sum()
print(missing_values_count_team_attributes)

In [None]:
subset=teamAttributes_df.loc[:,['buildUpPlayDribblingClass','buildUpPlayDribbling']]
display(subset[subset.isnull().any(axis=1)])
display(subset[subset.isnull().any(axis=1)].count())
display(subset.loc[subset['buildUpPlayDribblingClass'] == 'Little'])
subset_little= subset.loc[subset['buildUpPlayDribblingClass'] == 'Little']
subset_little.dropna(inplace=True)
little_mean=round(subset_little["buildUpPlayDribbling"].mean())
print('mean value that is supposed to be filled:', little_mean)
teamAttributes_df["buildUpPlayDribbling"].fillna(little_mean, inplace = True)

In [None]:
duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

print('The number of missing values in each column:')
missing_values_count_team_attributes = teamAttributes_df.isnull().sum()
print(missing_values_count_team_attributes)

teamAttributes_df

In [None]:
del teamAttributes_df['id']
teamAttributes_df= teamAttributes_df.sort_values(['team_fifa_api_id'])
teamAttributes_df.insert(0, 'team_attributes_id', range(1, len(teamAttributes_df)+1))
teamAttributes_df

In [None]:
teams1=team_df.loc[:,'team_fifa_api_id']
teams2=teamAttributes_df.loc[:,'team_fifa_api_id']
differ1= teams1[~teams1.isin(teams2)]
print('number of teams present in team_df and not present in teamAttributes_df= ', differ1.count())
differ2= teams2[~teams2.isin(teams1)]
print('number of teams present in teamAttributes_df and not present in team_df= ', differ2.count())